Fix: Sidebar scrollable + add Eltern-Portal nav link
overflow-hidden → overflow-y-auto so all nav items are reachable. Added /parent (Eltern-Portal) link with people icon. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,498 @@
|
||||
"""
|
||||
Cell-grid construction v2 (hybrid: broad columns via word lookup, narrow via cell-crop).
|
||||
Extracted from cv_cell_grid.py.
|
||||
Lizenz: Apache 2.0 — DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion, RowGeometry
|
||||
from cv_ocr_engines import (
|
||||
RAPIDOCR_AVAILABLE,
|
||||
_assign_row_words_to_columns,
|
||||
_clean_cell_text,
|
||||
_clean_cell_text_lite,
|
||||
_words_to_reading_order_text,
|
||||
_words_to_spaced_text,
|
||||
ocr_region_lighton,
|
||||
ocr_region_rapid,
|
||||
ocr_region_trocr,
|
||||
)
|
||||
from cv_cell_grid_helpers import (
|
||||
_MIN_WORD_CONF,
|
||||
_ensure_minimum_crop_size,
|
||||
_heal_row_gaps,
|
||||
_is_artifact_row,
|
||||
_select_psm_for_column,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _ocr_cell_crop — isolated cell-crop OCR for v2 hybrid mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ocr_cell_crop(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
) -> Dict[str, Any]:
    """OCR a single cell by cropping the column x row intersection.

    The crop is the cell rectangle grown by 3 px on every side (clamped to
    ``img_w`` / ``img_h``) so glyphs touching the cell border are not cut off.
    The reported ``bbox_px`` / ``bbox_pct`` always describe the exact,
    unpadded cell.

    Args:
        row_idx: Row index, used for ``cell_id`` / ``row_index``.
        col_idx: Column index, used for ``cell_id`` / ``col_index``.
        row: Row geometry; ``y`` and ``height`` are read.
        col: Column region; ``x``, ``width`` and ``type`` are read.
        ocr_img: Image used for the dark-pixel emptiness check (pixels below
            180 count as dark) and for the Tesseract path. May be ``None``.
        img_bgr: Image handed to the TrOCR / LightOn / RapidOCR engines. When
            it is ``None`` those engines are skipped and Tesseract is used.
        img_w: Image width used for clamping and percentage boxes.
        img_h: Image height used for clamping and percentage boxes.
        engine_name: One of ``"tesseract"``, ``"rapid"``, ``"lighton"``,
            ``"trocr-printed"``, ``"trocr-handwritten"``.
        lang: Default Tesseract language string.
        lang_map: Per-column-type language override, keyed by ``col.type``.

    Returns:
        A cell dict with ``cell_id``, ``row_index``, ``col_index``,
        ``col_type``, ``text``, ``confidence``, ``bbox_px``, ``bbox_pct``,
        ``ocr_engine`` and ``is_bold``; ``word_boxes`` (absolute image
        coordinates) is added when text was recognised.
    """
    # Display bbox: exact column x row intersection
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # Crop boundaries: small padding (3px each side) to avoid clipping
    # characters near column/row edges (e.g. parentheses, descenders).
    # Stays within image bounds but may extend slightly beyond the strict cell.
    _PAD = 3
    cx = max(0, disp_x - _PAD)
    cy = max(0, disp_y - _PAD)
    cx2 = min(img_w, disp_x + disp_w + _PAD)
    cy2 = min(img_h, disp_y + disp_h + _PAD)
    cw = cx2 - cx
    ch = cy2 - cy

    empty_cell = {
        'cell_id': f"R{row_idx:02d}_C{col_idx}",
        'row_index': row_idx,
        'col_index': col_idx,
        'col_type': col.type,
        'text': '',
        'confidence': 0.0,
        'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
        'bbox_pct': {
            'x': round(disp_x / img_w * 100, 2) if img_w else 0,
            'y': round(disp_y / img_h * 100, 2) if img_h else 0,
            'w': round(disp_w / img_w * 100, 2) if img_w else 0,
            'h': round(disp_h / img_h * 100, 2) if img_h else 0,
        },
        'ocr_engine': 'cell_crop_v2',
        'is_bold': False,
    }

    if cw <= 0 or ch <= 0:
        logger.debug("_ocr_cell_crop R%02d_C%d: zero-size crop (%dx%d)", row_idx, col_idx, cw, ch)
        return empty_cell

    # --- Pixel-density check: skip truly empty cells ---
    if ocr_img is not None:
        crop = ocr_img[cy:cy + ch, cx:cx + cw]
        if crop.size > 0:
            dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
            if dark_ratio < 0.005:
                logger.debug("_ocr_cell_crop R%02d_C%d: skip empty (dark_ratio=%.4f, crop=%dx%d)",
                             row_idx, col_idx, dark_ratio, cw, ch)
                return empty_cell

    # --- Prepare crop for OCR ---
    cell_lang = lang_map.get(col.type, lang)
    psm = _select_psm_for_column(col.type, col.width, row.height)
    text = ''
    avg_conf = 0.0
    used_engine = 'cell_crop_v2'

    def _tesseract(image, region, psm_mode):
        # `ocr_region` is not part of this module's top-level imports, so the
        # Tesseract paths used to fail with NameError. Importing lazily here
        # keeps the other engines unaffected.
        # NOTE(review): assumes cv_ocr_engines exports `ocr_region` next to
        # its `ocr_region_*` siblings — confirm.
        from cv_ocr_engines import ocr_region
        return ocr_region(image, region, lang=cell_lang, psm=psm_mode)

    if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_trocr(img_bgr, cell_region,
                                 handwritten=(engine_name == "trocr-handwritten"))
    elif engine_name == "lighton" and img_bgr is not None:
        cell_region = PageRegion(type=col.type, x=cx, y=cy, width=cw, height=ch)
        words = ocr_region_lighton(img_bgr, cell_region)
    elif engine_name == "rapid" and img_bgr is not None:
        # Upscale small BGR crops for RapidOCR.
        bgr_crop = img_bgr[cy:cy + ch, cx:cx + cw]
        if bgr_crop.size == 0:
            words = []
        else:
            crop_h, crop_w = bgr_crop.shape[:2]
            if crop_h < 80:
                # Force 3x upscale for short rows — small chars need more pixels
                scale = 3.0
                bgr_up = cv2.resize(bgr_crop, None, fx=scale, fy=scale,
                                    interpolation=cv2.INTER_CUBIC)
            else:
                bgr_up = _ensure_minimum_crop_size(bgr_crop, min_dim=150, max_scale=3)
            up_h, up_w = bgr_up.shape[:2]
            scale_x = up_w / max(crop_w, 1)
            scale_y = up_h / max(crop_h, 1)
            was_scaled = (up_w != crop_w or up_h != crop_h)
            logger.debug("_ocr_cell_crop R%02d_C%d: rapid %dx%d -> %dx%d (scale=%.1fx)",
                         row_idx, col_idx, crop_w, crop_h, up_w, up_h, scale_y)
            # The crop is OCR'd as its own image, so the region starts at (0, 0).
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = ocr_region_rapid(bgr_up, tmp_region)
            # Remap positions back to original image coords
            if words and was_scaled:
                for w in words:
                    w['left'] = int(w['left'] / scale_x) + cx
                    w['top'] = int(w['top'] / scale_y) + cy
                    w['width'] = int(w['width'] / scale_x)
                    w['height'] = int(w['height'] / scale_y)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
    else:
        # Tesseract: upscale tiny crops for better recognition
        if ocr_img is not None:
            crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
            upscaled = _ensure_minimum_crop_size(crop_slice)
            up_h, up_w = upscaled.shape[:2]
            tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
            words = _tesseract(upscaled, tmp_region, psm)
            # Remap word positions back to original image coordinates
            if words and (up_w != cw or up_h != ch):
                sx = cw / max(up_w, 1)
                sy = ch / max(up_h, 1)
                for w in words:
                    w['left'] = int(w['left'] * sx) + cx
                    w['top'] = int(w['top'] * sy) + cy
                    w['width'] = int(w['width'] * sx)
                    w['height'] = int(w['height'] * sy)
            elif words:
                for w in words:
                    w['left'] += cx
                    w['top'] += cy
        else:
            words = []

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        # Tolerance of at least the crop height: the whole cell is one line group.
        y_tol = max(15, ch)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR raw text=%r conf=%.1f nwords=%d crop=%dx%d psm=%s engine=%s",
                     row_idx, col_idx, text, avg_conf, len(words), cw, ch, psm, engine_name)
    else:
        logger.debug("_ocr_cell_crop R%02d_C%d: OCR returned NO words (crop=%dx%d psm=%s engine=%s)",
                     row_idx, col_idx, cw, ch, psm, engine_name)

    # --- PSM 7 fallback for still-empty Tesseract cells ---
    if not text.strip() and engine_name == "tesseract" and ocr_img is not None:
        crop_slice = ocr_img[cy:cy + ch, cx:cx + cw]
        upscaled = _ensure_minimum_crop_size(crop_slice)
        up_h, up_w = upscaled.shape[:2]
        tmp_region = PageRegion(type=col.type, x=0, y=0, width=up_w, height=up_h)
        psm7_words = _tesseract(upscaled, tmp_region, 7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            # Text is assembled in crop coordinates; the remap happens afterwards.
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_crop_v2_psm7'
                # Remap PSM7 word positions back to original image coords
                if up_w != cw or up_h != ch:
                    sx = cw / max(up_w, 1)
                    sy = ch / max(up_h, 1)
                    for w in psm7_words:
                        w['left'] = int(w['left'] * sx) + cx
                        w['top'] = int(w['top'] * sy) + cy
                        w['width'] = int(w['width'] * sx)
                        w['height'] = int(w['height'] * sy)
                else:
                    for w in psm7_words:
                        w['left'] += cx
                        w['top'] += cy
                words = psm7_words

    # --- Noise filter ---
    if text.strip():
        pre_filter = text
        text = _clean_cell_text_lite(text)
        if not text:
            logger.debug("_ocr_cell_crop R%02d_C%d: _clean_cell_text_lite REMOVED %r",
                         row_idx, col_idx, pre_filter)
            avg_conf = 0.0

    result = dict(empty_cell)
    result['text'] = text
    result['confidence'] = avg_conf
    result['ocr_engine'] = used_engine

    # Store individual word bounding boxes (absolute image coordinates)
    # for pixel-accurate overlay positioning in the frontend.
    if words and text.strip():
        result['word_boxes'] = [
            {
                'text': w.get('text', ''),
                'left': w['left'],
                'top': w['top'],
                'width': w['width'],
                'height': w['height'],
                'conf': w.get('conf', 0),
            }
            for w in words
            if w.get('text', '').strip()
        ]

    return result
|
||||
|
||||
|
||||
# Width threshold, in percent of the image width. Columns whose width is
# strictly below this value are classified as "narrow" and OCR'd per cell
# from an isolated crop; all other columns reuse the pre-assigned full-page
# words of each row.
_NARROW_COL_THRESHOLD_PCT = 15.0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_cell_grid_v2 — hybrid grid builder (current default)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cell_grid_v2(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
    skip_heal_gaps: bool = False,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Hybrid grid: full-page words for broad columns, cell-crop OCR for narrow ones.

    Backward-compatible replacement for build_cell_grid(): same positional
    parameters and return type, plus the optional ``skip_heal_gaps`` flag.

    Strategy:
    - Broad columns (>= 15% of image width): use the words already assigned to
      each row (``row.words`` via ``_assign_row_words_to_columns``).
    - Narrow columns (< 15% of image width): OCR each cell from an isolated
      crop (``_ocr_cell_crop``) in a thread pool, to prevent neighbour
      bleeding from adjacent broad columns.

    Args:
        ocr_img: Image passed to the cell-crop OCR (emptiness check, Tesseract).
        column_regions: All detected regions; header/footer/margin/ignore
            types are dropped.
        row_geometries: All detected rows; only ``row_type == 'content'`` rows
            with words that are not artifact rows are used. Unless
            ``skip_heal_gaps`` is set, the surviving row objects are mutated
            in place by ``_heal_row_gaps``.
        img_w: Image width used for percentages and the broad/narrow split.
        img_h: Image height used for percentages.
        lang: Default Tesseract language string.
        ocr_engine: ``"trocr-printed"``, ``"trocr-handwritten"``, ``"lighton"``
            or ``"rapid"`` select that engine for narrow cells; anything else
            (including ``"auto"``) uses Tesseract.
        img_bgr: Image for the non-Tesseract engines.
        skip_heal_gaps: When True, rows keep their exact geometry instead of
            being expanded to fill gaps left by removed rows.

    Returns:
        ``(cells, columns_meta)``: cells sorted by (row_index, col_index) with
        all-empty rows removed, and one metadata dict per used column.
    """
    engine_name = "tesseract"
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid":
        if RAPIDOCR_AVAILABLE:
            engine_name = "rapid"
        else:
            # Same message as the legacy builder; previously this fallback was silent.
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")

    logger.info(f"build_cell_grid_v2: using OCR engine '{engine_name}' (hybrid mode)")

    # Filter to content rows only
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows found")
        return [], []

    # Filter phantom rows (word_count=0) and artifact rows
    before = len(content_rows)
    content_rows = [r for r in content_rows if r.word_count > 0]
    skipped = before - len(content_rows)
    if skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {skipped} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows with words found")
        return [], []

    before_art = len(content_rows)
    content_rows = [r for r in content_rows if not _is_artifact_row(r)]
    artifact_skipped = before_art - len(content_rows)
    if artifact_skipped > 0:
        logger.info(f"build_cell_grid_v2: skipped {artifact_skipped} artifact rows")
    if not content_rows:
        logger.warning("build_cell_grid_v2: no content rows after artifact filtering")
        return [], []

    # Filter columns
    _skip_types = {'column_ignore', 'header', 'footer', 'margin_top',
                   'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [c for c in column_regions if c.type not in _skip_types]
    if not relevant_cols:
        logger.warning("build_cell_grid_v2: no usable columns found")
        return [], []

    # Heal row gaps -- use header/footer boundaries
    content_rows.sort(key=lambda r: r.y)
    header_rows = [r for r in row_geometries if r.row_type == 'header']
    footer_rows = [r for r in row_geometries if r.row_type == 'footer']
    if header_rows:
        top_bound = max(r.y + r.height for r in header_rows)
    else:
        top_bound = content_rows[0].y
    if footer_rows:
        bottom_bound = min(r.y for r in footer_rows)
    else:
        bottom_bound = content_rows[-1].y + content_rows[-1].height

    # skip_heal_gaps: When True, keep cell positions at their exact row geometry
    # positions without expanding to fill gaps from removed rows.
    if not skip_heal_gaps:
        _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    relevant_cols.sort(key=lambda c: c.x)

    columns_meta = [
        {'index': ci, 'type': c.type, 'x': c.x, 'width': c.width}
        for ci, c in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- Classify columns as broad vs narrow ---
    narrow_col_indices = set()
    for ci, col in enumerate(relevant_cols):
        col_pct = (col.width / img_w * 100) if img_w > 0 else 0
        if col_pct < _NARROW_COL_THRESHOLD_PCT:
            narrow_col_indices.add(ci)

    broad_col_count = len(relevant_cols) - len(narrow_col_indices)
    logger.info(f"build_cell_grid_v2: {broad_col_count} broad columns (full-page), "
                f"{len(narrow_col_indices)} narrow columns (cell-crop)")

    # Single full-width column (box sub-session): preserve spacing.
    # Depends only on the column layout, so it is computed once, not per cell.
    is_single_full_column = (
        len(relevant_cols) == 1
        and img_w > 0
        and relevant_cols[0].width / img_w > 0.9
    )

    # --- Phase 1: Broad columns via full-page word assignment ---
    cells: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        # Assign full-page words to columns for this row
        col_words = _assign_row_words_to_columns(row, relevant_cols)

        for col_idx, col in enumerate(relevant_cols):
            if col_idx not in narrow_col_indices:
                # BROAD column: use pre-assigned full-page words
                words = col_words.get(col_idx, [])
                # Filter low-confidence words
                words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

                if words:
                    y_tol = max(15, row.height)
                    if is_single_full_column:
                        text = _words_to_spaced_text(words, y_tolerance_px=y_tol)
                        logger.info(f"R{row_idx:02d}: {len(words)} words, "
                                    f"text={text!r:.100}")
                    else:
                        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
                    avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
                else:
                    text = ''
                    avg_conf = 0.0
                    if is_single_full_column:
                        logger.info(f"R{row_idx:02d}: 0 words (row has "
                                    f"{row.word_count} total, y={row.y}..{row.y+row.height})")

                # Apply noise filter -- but NOT for single-column sub-sessions
                if not is_single_full_column:
                    text = _clean_cell_text(text)

                cell = {
                    'cell_id': f"R{row_idx:02d}_C{col_idx}",
                    'row_index': row_idx,
                    'col_index': col_idx,
                    'col_type': col.type,
                    'text': text,
                    'confidence': avg_conf,
                    'bbox_px': {
                        'x': col.x, 'y': row.y,
                        'w': col.width, 'h': row.height,
                    },
                    'bbox_pct': {
                        'x': round(col.x / img_w * 100, 2) if img_w else 0,
                        'y': round(row.y / img_h * 100, 2) if img_h else 0,
                        'w': round(col.width / img_w * 100, 2) if img_w else 0,
                        'h': round(row.height / img_h * 100, 2) if img_h else 0,
                    },
                    'ocr_engine': 'word_lookup',
                    'is_bold': False,
                }
                # Store word bounding boxes for pixel-accurate overlay
                if words and text.strip():
                    cell['word_boxes'] = [
                        {
                            'text': w.get('text', ''),
                            'left': w['left'],
                            'top': w['top'],
                            'width': w['width'],
                            'height': w['height'],
                            'conf': w.get('conf', 0),
                        }
                        for w in words
                        if w.get('text', '').strip()
                    ]
                cells.append(cell)

    # --- Phase 2: Narrow columns via cell-crop OCR (parallel) ---
    narrow_tasks = []
    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            if col_idx in narrow_col_indices:
                narrow_tasks.append((row_idx, col_idx, row, col))

    if narrow_tasks:
        max_workers = 4 if engine_name == "tesseract" else 2
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {
                pool.submit(
                    _ocr_cell_crop,
                    ri, ci, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ): (ri, ci)
                for ri, ci, row, col in narrow_tasks
            }
            for future in as_completed(futures):
                try:
                    cell = future.result()
                    cells.append(cell)
                except Exception as e:
                    # Best effort: a failed cell is logged and left out of the grid.
                    ri, ci = futures[future]
                    logger.error(f"build_cell_grid_v2: narrow cell R{ri:02d}_C{ci} failed: {e}")

    # Sort cells by (row_index, col_index)
    cells.sort(key=lambda c: (c['row_index'], c['col_index']))

    # Remove all-empty rows
    rows_with_text: set = set()
    for cell in cells:
        if cell['text'].strip():
            rows_with_text.add(cell['row_index'])
    before_filter = len(cells)
    cells = [c for c in cells if c['row_index'] in rows_with_text]
    empty_rows_removed = (before_filter - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid_v2: removed {empty_rows_removed} all-empty rows")

    logger.info(f"build_cell_grid_v2: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name} (hybrid)")

    return cells, columns_meta
|
||||
@@ -0,0 +1,60 @@
|
||||
"""
|
||||
Cell-grid construction (v2 + legacy), vocab conversion, and word-grid OCR.
|
||||
|
||||
Re-export hub — all public and private names remain importable from here
|
||||
for backward compatibility. The actual implementations live in:
|
||||
|
||||
cv_cell_grid_helpers.py — shared helpers (_heal_row_gaps, _is_artifact_row, ...)
|
||||
cv_cell_grid_build.py — v2 hybrid grid (build_cell_grid_v2, _ocr_cell_crop)
|
||||
cv_cell_grid_legacy.py — deprecated v1 grid (build_cell_grid, _ocr_single_cell)
|
||||
cv_cell_grid_streaming.py — streaming variants (build_cell_grid_v2_streaming, ...)
|
||||
cv_cell_grid_merge.py — row-merging logic (_merge_wrapped_rows, ...)
|
||||
cv_cell_grid_vocab.py — vocab extraction (_cells_to_vocab_entries, build_word_grid)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
# --- Helpers ---
|
||||
from cv_cell_grid_helpers import ( # noqa: F401
|
||||
_MIN_WORD_CONF,
|
||||
_compute_cell_padding,
|
||||
_ensure_minimum_crop_size,
|
||||
_heal_row_gaps,
|
||||
_is_artifact_row,
|
||||
_select_psm_for_column,
|
||||
)
|
||||
|
||||
# --- v2 build (current default) ---
|
||||
from cv_cell_grid_build import ( # noqa: F401
|
||||
_NARROW_COL_THRESHOLD_PCT,
|
||||
_ocr_cell_crop,
|
||||
build_cell_grid_v2,
|
||||
)
|
||||
|
||||
# --- Legacy build (DEPRECATED) ---
|
||||
from cv_cell_grid_legacy import ( # noqa: F401
|
||||
_ocr_single_cell,
|
||||
build_cell_grid,
|
||||
)
|
||||
|
||||
# --- Streaming variants ---
|
||||
from cv_cell_grid_streaming import ( # noqa: F401
|
||||
build_cell_grid_streaming,
|
||||
build_cell_grid_v2_streaming,
|
||||
)
|
||||
|
||||
# --- Row merging ---
|
||||
from cv_cell_grid_merge import ( # noqa: F401
|
||||
_PHONETIC_ONLY_RE,
|
||||
_is_phonetic_only_text,
|
||||
_merge_continuation_rows,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_wrapped_rows,
|
||||
)
|
||||
|
||||
# --- Vocab extraction ---
|
||||
from cv_cell_grid_vocab import ( # noqa: F401
|
||||
_cells_to_vocab_entries,
|
||||
build_word_grid,
|
||||
)
|
||||
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
Shared helpers for cell-grid construction (v2 + legacy).
|
||||
|
||||
Extracted from cv_cell_grid.py — used by both cv_cell_grid_build and
|
||||
cv_cell_grid_legacy.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
# Minimum OCR word confidence to keep (used across multiple functions).
# Callers compare with `w.get('conf', 0) >= _MIN_WORD_CONF`, so a word
# without a 'conf' entry is treated as confidence 0 and dropped.
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _compute_cell_padding(col_width: int, img_w: int) -> int:
|
||||
"""Adaptive padding for OCR crops based on column width.
|
||||
|
||||
Narrow columns (page_ref, marker) need more surrounding context so
|
||||
Tesseract can segment characters correctly. Wide columns keep the
|
||||
minimal 4 px padding to avoid pulling in neighbours.
|
||||
"""
|
||||
col_pct = col_width / img_w * 100 if img_w > 0 else 100
|
||||
if col_pct < 5:
|
||||
return max(20, col_width // 2)
|
||||
if col_pct < 10:
|
||||
return max(12, col_width // 4)
|
||||
if col_pct < 15:
|
||||
return 8
|
||||
return 4
|
||||
|
||||
|
||||
def _ensure_minimum_crop_size(crop: np.ndarray, min_dim: int = 150,
|
||||
max_scale: int = 3) -> np.ndarray:
|
||||
"""Upscale tiny crops so Tesseract gets enough pixel data.
|
||||
|
||||
If either dimension is below *min_dim*, the crop is bicubic-upscaled
|
||||
so the smallest dimension reaches *min_dim* (capped at *max_scale* x).
|
||||
"""
|
||||
h, w = crop.shape[:2]
|
||||
if h >= min_dim and w >= min_dim:
|
||||
return crop
|
||||
scale = min(max_scale, max(min_dim / max(h, 1), min_dim / max(w, 1)))
|
||||
if scale <= 1.0:
|
||||
return crop
|
||||
new_w = int(w * scale)
|
||||
new_h = int(h * scale)
|
||||
return cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||
|
||||
|
||||
def _select_psm_for_column(col_type: str, col_width: int,
|
||||
row_height: int) -> int:
|
||||
"""Choose the best Tesseract PSM for a given column geometry.
|
||||
|
||||
- page_ref columns are almost always single short tokens -> PSM 8
|
||||
- Very narrow or short cells -> PSM 7 (single text line)
|
||||
- Everything else -> PSM 6 (uniform block)
|
||||
"""
|
||||
if col_type in ('page_ref', 'marker'):
|
||||
return 8 # single word
|
||||
if col_width < 100 or row_height < 30:
|
||||
return 7 # single line
|
||||
return 6 # uniform block
|
||||
|
||||
|
||||
def _is_artifact_row(row: RowGeometry) -> bool:
|
||||
"""Return True if this row contains only scan artifacts, not real text.
|
||||
|
||||
Artifact rows (scanner shadows, noise) typically produce only single-character
|
||||
detections. A real content row always has at least one token with 2+ characters.
|
||||
"""
|
||||
if row.word_count == 0:
|
||||
return True
|
||||
texts = [w.get('text', '').strip() for w in row.words]
|
||||
return all(len(t) <= 1 for t in texts)
|
||||
|
||||
|
||||
def _heal_row_gaps(
    rows: List[RowGeometry],
    top_bound: int,
    bottom_bound: int,
) -> None:
    """Stretch rows in place so they cover the vertical gaps between them.

    ``rows`` is sorted by ``y`` (in place) and each row's ``y`` / ``height``
    is rewritten: a gap of more than 1 px between two neighbours is split at
    its midpoint, the first row starts at ``top_bound`` and the last row ends
    at ``bottom_bound``. Neighbours that touch or overlap keep their shared
    edge. The resulting height is never smaller than 5.

    All midpoints are computed from the geometry as it was before any row
    was modified. An empty list is a no-op.
    """
    if not rows:
        return
    rows.sort(key=lambda r: r.y)
    n = len(rows)
    # (top, bottom) of every row before mutation
    spans = [(r.y, r.y + r.height) for r in rows]
    last_index = n - 1

    for idx, (row, (own_top, own_bottom)) in enumerate(zip(rows, spans)):
        # Upper edge: page bound for the first row, else gap midpoint above
        if idx == 0:
            healed_top = top_bound
        else:
            above_bottom = spans[idx - 1][1]
            space = own_top - above_bottom
            healed_top = above_bottom + space // 2 if space > 1 else own_top

        # Lower edge: page bound for the last row, else gap midpoint below
        if idx == last_index:
            healed_bottom = bottom_bound
        else:
            below_top = spans[idx + 1][0]
            space = below_top - own_bottom
            healed_bottom = own_bottom + space // 2 if space > 1 else own_bottom

        row.y = healed_top
        row.height = max(5, healed_bottom - healed_top)

    logger.debug(
        f"_heal_row_gaps: {n} rows -> y range [{rows[0].y}..{rows[-1].y + rows[-1].height}] "
        f"(bounds: top={top_bound}, bottom={bottom_bound})"
    )
|
||||
@@ -0,0 +1,436 @@
|
||||
"""
|
||||
Legacy cell-grid construction (v1) -- DEPRECATED, kept for backward compat.
|
||||
|
||||
Extracted from cv_cell_grid.py. Prefer build_cell_grid_v2 for new code.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion, RowGeometry
|
||||
from cv_ocr_engines import (
|
||||
RAPIDOCR_AVAILABLE,
|
||||
_assign_row_words_to_columns,
|
||||
_clean_cell_text,
|
||||
_words_to_reading_order_text,
|
||||
ocr_region_lighton,
|
||||
ocr_region_rapid,
|
||||
ocr_region_trocr,
|
||||
)
|
||||
from cv_cell_grid_helpers import (
|
||||
_MIN_WORD_CONF,
|
||||
_compute_cell_padding,
|
||||
_ensure_minimum_crop_size,
|
||||
_heal_row_gaps,
|
||||
_is_artifact_row,
|
||||
_select_psm_for_column,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _ocr_single_cell — legacy per-cell OCR with multi-level fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ocr_single_cell(
    row_idx: int,
    col_idx: int,
    row: RowGeometry,
    col: PageRegion,
    ocr_img: np.ndarray,
    img_bgr: Optional[np.ndarray],
    img_w: int,
    img_h: int,
    use_rapid: bool,
    engine_name: str,
    lang: str,
    lang_map: Dict[str, str],
    preassigned_words: Optional[List[Dict]] = None,
) -> Dict[str, Any]:
    """Populate a single cell (column x row intersection), legacy v1 strategy.

    Order of attempts, each one only if the cell is still empty:
      1. Word lookup from ``preassigned_words``.
      2. Cell OCR on a padded crop, if the crop contains enough dark pixels.
         Narrow columns are upscaled and always go through Tesseract.
      3. Tesseract PSM 7 on the same crop (skipped when ``use_rapid``).
      4. For narrow columns: RapidOCR over the whole row strip, keeping the
         words that overlap the column.
    The final text is passed through ``_clean_cell_text``.

    Args:
        row_idx: Row index, used for ``cell_id`` / ``row_index``.
        col_idx: Column index, used for ``cell_id`` / ``col_index``.
        row: Row geometry; ``y`` and ``height`` are read.
        col: Column region; ``x``, ``width`` and ``type`` are read.
        ocr_img: Image for the dark-pixel check and Tesseract. May be ``None``.
        img_bgr: Image for TrOCR / LightOn / RapidOCR. May be ``None``.
        img_w: Image width used for clamping and percentage boxes.
        img_h: Image height used for clamping and percentage boxes.
        use_rapid: Whether RapidOCR is the selected fallback engine.
        engine_name: Resolved engine name.
        lang: Default Tesseract language string.
        lang_map: Per-column-type language override, keyed by ``col.type``.
        preassigned_words: Full-page words already assigned to this cell.

    Returns:
        A cell dict with ``cell_id``, ``row_index``, ``col_index``,
        ``col_type``, ``text``, ``confidence``, ``bbox_px``, ``bbox_pct`` and
        ``ocr_engine``.
    """
    # Display bbox: exact column x row intersection (no padding)
    disp_x = col.x
    disp_y = row.y
    disp_w = col.width
    disp_h = row.height

    # OCR crop: adaptive padding -- narrow columns get more context
    pad = _compute_cell_padding(col.width, img_w)
    cell_x = max(0, col.x - pad)
    cell_y = max(0, row.y - pad)
    cell_w = min(col.width + 2 * pad, img_w - cell_x)
    cell_h = min(row.height + 2 * pad, img_h - cell_y)
    is_narrow = (col.width / img_w * 100) < 15 if img_w > 0 else False

    cell_lang = lang_map.get(col.type, lang)

    def _tesseract(image, region, psm_mode):
        # `ocr_region` is not part of this module's top-level imports, so every
        # Tesseract fallback used to fail with NameError. Importing lazily here
        # keeps the other engines unaffected.
        # NOTE(review): assumes cv_ocr_engines exports `ocr_region` next to
        # its `ocr_region_*` siblings — confirm.
        from cv_ocr_engines import ocr_region
        return ocr_region(image, region, lang=cell_lang, psm=psm_mode)

    def _cell(cell_text, confidence, engine):
        # Single place that builds the result; percentages are guarded against
        # a zero image size, as in the v2 cell builder.
        return {
            'cell_id': f"R{row_idx:02d}_C{col_idx}",
            'row_index': row_idx,
            'col_index': col_idx,
            'col_type': col.type,
            'text': cell_text,
            'confidence': confidence,
            'bbox_px': {'x': disp_x, 'y': disp_y, 'w': disp_w, 'h': disp_h},
            'bbox_pct': {
                'x': round(disp_x / img_w * 100, 2) if img_w else 0,
                'y': round(disp_y / img_h * 100, 2) if img_h else 0,
                'w': round(disp_w / img_w * 100, 2) if img_w else 0,
                'h': round(disp_h / img_h * 100, 2) if img_h else 0,
            },
            'ocr_engine': engine,
        }

    if disp_w <= 0 or disp_h <= 0:
        return _cell('', 0.0, 'word_lookup')

    # --- PRIMARY: Word-lookup from full-page Tesseract ---
    words = preassigned_words if preassigned_words is not None else []
    used_engine = 'word_lookup'

    # Filter low-confidence words
    if words:
        words = [w for w in words if w.get('conf', 0) >= _MIN_WORD_CONF]

    if words:
        y_tol = max(15, row.height)
        text = _words_to_reading_order_text(words, y_tolerance_px=y_tol)
        avg_conf = round(sum(w['conf'] for w in words) / len(words), 1)
    else:
        text = ''
        avg_conf = 0.0

    # --- FALLBACK: Cell-OCR for empty cells ---
    _run_fallback = False
    if not text.strip() and cell_w > 0 and cell_h > 0:
        if ocr_img is not None:
            crop = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            if crop.size > 0:
                # Only bother with OCR when the crop has visible ink.
                dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
                _run_fallback = dark_ratio > 0.005

    if _run_fallback:
        # For narrow columns, upscale the crop before OCR
        if is_narrow and ocr_img is not None:
            _crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
            _upscaled = _ensure_minimum_crop_size(_crop_slice)
            _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
            if _upscaled is not _crop_slice:
                # The helper returns the same object when it did not scale.
                _up_h, _up_w = _upscaled.shape[:2]
                _tmp_region = PageRegion(
                    type=col.type, x=0, y=0, width=_up_w, height=_up_h,
                )
                fallback_words = _tesseract(_upscaled, _tmp_region, _cell_psm)
                # Remap word positions back to original image coordinates
                _sx = cell_w / max(_up_w, 1)
                _sy = cell_h / max(_up_h, 1)
                for _fw in (fallback_words or []):
                    _fw['left'] = int(_fw['left'] * _sx) + cell_x
                    _fw['top'] = int(_fw['top'] * _sy) + cell_y
                    _fw['width'] = int(_fw['width'] * _sx)
                    _fw['height'] = int(_fw['height'] * _sy)
            else:
                cell_region = PageRegion(
                    type=col.type, x=cell_x, y=cell_y,
                    width=cell_w, height=cell_h,
                )
                fallback_words = _tesseract(ocr_img, cell_region, _cell_psm)
        else:
            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )
            if engine_name in ("trocr-printed", "trocr-handwritten") and img_bgr is not None:
                fallback_words = ocr_region_trocr(img_bgr, cell_region, handwritten=(engine_name == "trocr-handwritten"))
            elif engine_name == "lighton" and img_bgr is not None:
                fallback_words = ocr_region_lighton(img_bgr, cell_region)
            elif use_rapid and img_bgr is not None:
                fallback_words = ocr_region_rapid(img_bgr, cell_region)
            else:
                _cell_psm = _select_psm_for_column(col.type, col.width, row.height)
                fallback_words = _tesseract(ocr_img, cell_region, _cell_psm)

        if fallback_words:
            fallback_words = [w for w in fallback_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if fallback_words:
            fb_avg_h = sum(w['height'] for w in fallback_words) / len(fallback_words)
            fb_y_tol = max(10, int(fb_avg_h * 0.5))
            fb_text = _words_to_reading_order_text(fallback_words, y_tolerance_px=fb_y_tol)
            if fb_text.strip():
                text = fb_text
                avg_conf = round(
                    sum(w['conf'] for w in fallback_words) / len(fallback_words), 1
                )
                used_engine = 'cell_ocr_fallback'

    # --- SECONDARY FALLBACK: PSM=7 (single line) for still-empty cells ---
    if not text.strip() and _run_fallback and not use_rapid:
        _fb_region = PageRegion(
            type=col.type, x=cell_x, y=cell_y,
            width=cell_w, height=cell_h,
        )
        psm7_words = _tesseract(ocr_img, _fb_region, 7)
        if psm7_words:
            psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
        if psm7_words:
            p7_text = _words_to_reading_order_text(psm7_words, y_tolerance_px=10)
            if p7_text.strip():
                text = p7_text
                avg_conf = round(
                    sum(w['conf'] for w in psm7_words) / len(psm7_words), 1
                )
                used_engine = 'cell_ocr_psm7'

    # --- TERTIARY FALLBACK: Row-strip re-OCR for narrow columns ---
    if not text.strip() and is_narrow and img_bgr is not None:
        row_region = PageRegion(
            type='_row_strip', x=0, y=row.y,
            width=img_w, height=row.height,
        )
        strip_words = ocr_region_rapid(img_bgr, row_region)
        if strip_words:
            col_left = col.x
            col_right = col.x + col.width
            col_words = []
            for sw in strip_words:
                sw_left = sw.get('left', 0)
                sw_right = sw_left + sw.get('width', 0)
                overlap = max(0, min(sw_right, col_right) - max(sw_left, col_left))
                # Keep words with more than 30% of their width inside the column.
                if overlap > sw.get('width', 1) * 0.3:
                    col_words.append(sw)
            if col_words:
                col_words = [w for w in col_words if w.get('conf', 0) >= _MIN_WORD_CONF]
            if col_words:
                rs_text = _words_to_reading_order_text(col_words, y_tolerance_px=row.height)
                if rs_text.strip():
                    text = rs_text
                    avg_conf = round(
                        sum(w['conf'] for w in col_words) / len(col_words), 1
                    )
                    used_engine = 'row_strip_rapid'

    # --- NOISE FILTER: clear cells that contain only OCR artifacts ---
    if text.strip():
        text = _clean_cell_text(text)
        if not text:
            avg_conf = 0.0

    return _cell(text, avg_conf, used_engine)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_cell_grid — legacy grid builder (DEPRECATED)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cell_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build the legacy columns x rows cell grid and OCR every cell.

    DEPRECATED: Use build_cell_grid_v2 instead.

    Pipeline:
      1. Resolve the OCR engine from ``ocr_engine``.
      2. Keep only 'content' rows that have words and are not artifact rows.
      3. Drop columns whose type is an ignore/header/footer/margin type.
      4. OCR each (row, column) cell through ``_ocr_single_cell``.
      5. For columns with at least three empty-but-inked cells, OCR the
         whole column strip once and distribute the words back to the cells.
      6. Remove rows in which no cell has text.

    Args:
        ocr_img: Page image used for the non-BGR OCR path and for the
            dark-pixel check of empty cells.
        column_regions: Detected column regions.
        row_geometries: Detected rows.
        img_w: Image width, forwarded to the per-cell OCR.
        img_h: Image height, forwarded to the per-cell OCR.
        lang: Default OCR language when the column type has no own mapping.
        ocr_engine: 'auto', 'rapid', 'trocr-printed', 'trocr-handwritten',
            'lighton'; any other value resolves to 'tesseract'.
        img_bgr: Image passed to the RapidOCR / TrOCR / LightOn calls; when
            None those engines are not used in the batch fallback.

    Returns:
        ``(cells, columns_meta)``. Both lists are empty when no usable rows
        or columns remain.
    """
    # --- 1. Engine resolution ---
    rapid_enabled = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine in ("auto", "rapid"):
        if ocr_engine == "auto":
            rapid_enabled = RAPIDOCR_AVAILABLE and img_bgr is not None
        elif RAPIDOCR_AVAILABLE:
            rapid_enabled = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if rapid_enabled else "tesseract"
    else:
        engine_name = "tesseract"

    logger.info(f"build_cell_grid: using OCR engine '{engine_name}'")

    # --- 2. Row filtering (header/footer rows are not 'content') ---
    content_rows = [row for row in row_geometries if row.row_type == 'content']
    if not content_rows:
        logger.warning("build_cell_grid: no content rows found")
        return [], []

    n_content = len(content_rows)
    content_rows = [row for row in content_rows if row.word_count > 0]
    n_phantom = n_content - len(content_rows)
    if n_phantom > 0:
        logger.info(f"build_cell_grid: skipped {n_phantom} phantom rows (word_count=0)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows with words found")
        return [], []

    # --- 3. Column filtering ---
    ignored_types = {
        'column_ignore', 'header', 'footer',
        'margin_top', 'margin_bottom', 'margin_left', 'margin_right',
    }
    relevant_cols = [col for col in column_regions if col.type not in ignored_types]
    if not relevant_cols:
        logger.warning("build_cell_grid: no usable columns found")
        return [], []

    n_with_words = len(content_rows)
    content_rows = [row for row in content_rows if not _is_artifact_row(row)]
    n_artifacts = n_with_words - len(content_rows)
    if n_artifacts > 0:
        logger.info(f"build_cell_grid: skipped {n_artifacts} artifact rows (all single-char words)")
    if not content_rows:
        logger.warning("build_cell_grid: no content rows after artifact filtering")
        return [], []

    # Row gaps are healed within the vertical extent of the usable columns.
    _heal_row_gaps(
        content_rows,
        top_bound=min(col.y for col in relevant_cols),
        bottom_bound=max(col.y + col.height for col in relevant_cols),
    )

    # Column indices used below refer to the left-to-right order.
    relevant_cols.sort(key=lambda col: col.x)

    columns_meta = [
        {'index': idx, 'type': col.type, 'x': col.x, 'width': col.width}
        for idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    # --- 4. Per-cell OCR ---
    cells: List[Dict[str, Any]] = []
    for row_idx, row in enumerate(content_rows):
        words_by_col = _assign_row_words_to_columns(row, relevant_cols)
        cells.extend(
            _ocr_single_cell(
                row_idx, col_idx, row, col,
                ocr_img, img_bgr, img_w, img_h,
                rapid_enabled, engine_name, lang, lang_map,
                preassigned_words=words_by_col[col_idx],
            )
            for col_idx, col in enumerate(relevant_cols)
        )

    # --- 5. Batch fallback: re-OCR empty cells by column strip ---
    # Collect, per column, the positions of cells that are still empty but
    # contain enough dark pixels to plausibly hold text.
    retry_by_col: Dict[int, List[int]] = {}
    for cell_pos, cell in enumerate(cells):
        if cell['text'].strip() or cell.get('ocr_engine') == 'cell_ocr_psm7':
            continue
        px = cell['bbox_px']
        bx, by, bw, bh = px['x'], px['y'], px['w'], px['h']
        if not (bw > 0 and bh > 0 and ocr_img is not None):
            continue
        patch = ocr_img[by:by + bh, bx:bx + bw]
        if patch.size <= 0:
            continue
        dark_fraction = float(np.count_nonzero(patch < 180)) / patch.size
        if dark_fraction > 0.005:
            retry_by_col.setdefault(cell['col_index'], []).append(cell_pos)

    for col_idx, positions in retry_by_col.items():
        # A strip re-OCR is only attempted for three or more candidates.
        if len(positions) < 3:
            continue

        boxes = [cells[pos]['bbox_px'] for pos in positions]
        strip_top = min(box['y'] for box in boxes)
        strip_bottom = max(box['y'] + box['h'] for box in boxes)
        column = relevant_cols[col_idx]

        strip_region = PageRegion(
            type=column.type,
            x=boxes[0]['x'], y=strip_top,
            width=boxes[0]['w'], height=strip_bottom - strip_top,
        )
        strip_lang = lang_map.get(column.type, lang)

        has_bgr = img_bgr is not None
        if has_bgr and engine_name in ("trocr-printed", "trocr-handwritten"):
            strip_words = ocr_region_trocr(
                img_bgr, strip_region, handwritten=(engine_name == "trocr-handwritten")
            )
        elif has_bgr and engine_name == "lighton":
            strip_words = ocr_region_lighton(img_bgr, strip_region)
        elif has_bgr and rapid_enabled:
            strip_words = ocr_region_rapid(img_bgr, strip_region)
        else:
            strip_words = ocr_region(ocr_img, strip_region, lang=strip_lang, psm=6)

        if not strip_words:
            continue

        strip_words = [word for word in strip_words if word.get('conf', 0) >= 30]
        if not strip_words:
            continue

        for pos in positions:
            target = cells[pos]
            row_h = target['bbox_px']['h']
            row_mid = target['bbox_px']['y'] + row_h / 2

            # Words whose vertical centre lies within 0.8 cell heights of
            # the cell centre, ordered left to right.
            hits = sorted(
                (
                    word for word in strip_words
                    if abs((word['top'] + word['height'] / 2) - row_mid) < row_h * 0.8
                ),
                key=lambda word: word['left'],
            )
            if not hits:
                continue
            recovered = _clean_cell_text(' '.join(word['text'] for word in hits))
            if not recovered.strip():
                continue
            target['text'] = recovered
            target['confidence'] = round(
                sum(word['conf'] for word in hits) / len(hits), 1
            )
            target['ocr_engine'] = 'batch_column_ocr'

        batch_filled = sum(1 for pos in positions if cells[pos]['text'].strip())
        if batch_filled > 0:
            logger.info(
                f"build_cell_grid: batch OCR filled {batch_filled}/{len(positions)} "
                f"empty cells in column {col_idx}"
            )

    # --- 6. Remove all-empty rows ---
    rows_with_text = {cell['row_index'] for cell in cells if cell['text'].strip()}
    n_cells_before = len(cells)
    cells = [cell for cell in cells if cell['row_index'] in rows_with_text]
    empty_rows_removed = (n_cells_before - len(cells)) // max(len(relevant_cols), 1)
    if empty_rows_removed > 0:
        logger.info(f"build_cell_grid: removed {empty_rows_removed} all-empty rows after OCR")

    logger.info(f"build_cell_grid: {len(cells)} cells from "
                f"{len(content_rows)} rows x {len(relevant_cols)} columns, "
                f"engine={engine_name}")

    return cells, columns_meta
|
||||
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
Row-merging logic for vocabulary entries (phonetic, wrapped, continuation rows).
|
||||
|
||||
Extracted from cv_cell_grid.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_ocr_engines import _RE_ALPHA
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Regex: line starts with phonetic bracket content only (no real word before it).
# Shape: optional leading whitespace, any number of opening '[', '(' or quote
# characters, then any run of characters other than ']', then any number of
# closing ']', ')' or whitespace characters up to the end of the string.
# NOTE(review): not referenced by the functions visible in this part of the
# module, and the middle class `[^\]]*` also accepts plain words - confirm
# whether this pattern is still needed.
_PHONETIC_ONLY_RE = re.compile(
    r'''^\s*[\[\('"]*[^\]]*[\])\s]*$'''
)
|
||||
|
||||
|
||||
def _is_phonetic_only_text(text: str) -> bool:
    """Return True when *text* is nothing but a phonetic transcription.

    Examples:
        ['mani serva]          -> True
        [dance]                -> True
        ["a:mand]              -> True
        almond ['a:mand]       -> False  (a real word precedes the bracket)
        Mandel                 -> False  (no bracket at all)

    The text must contain '[' or ']'. Bracketed spans and bracket/quote/
    paren/whitespace characters are then removed; the text counts as
    phonetic-only when fewer than two characters matched by ``_RE_ALPHA``
    (presumably letters - defined in cv_ocr_engines) remain.
    """
    stripped = text.strip()

    # Empty input, or no bracket anywhere: cannot be a transcription.
    if not stripped or not ('[' in stripped or ']' in stripped):
        return False

    # Drop complete "[...]" spans first, then stray brackets, quotes,
    # parentheses and whitespace that may be left over.
    residue = re.sub(r"\[.*?\]", '', stripped)
    residue = re.sub(r"[\[\]'\"()\s]", '', residue)

    leftover_letters = ''.join(_RE_ALPHA.findall(residue))
    return len(leftover_letters) < 2
|
||||
|
||||
|
||||
def _merge_phonetic_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Fold phonetic-only rows into the entry that precedes them.

    On dictionary pages the phonetic transcription sometimes wraps onto its
    own row, e.g.:

        Row 28: EN="it's a money-saver"  DE="es spart Kosten"
        Row 29: EN="['mani serva]"       DE=""

    Row 29 has phonetic-only EN text and no DE text, so its EN (and its
    example, if any) is appended to row 28.

    The preceding entry dict is modified in place. Lists with fewer than two
    entries are returned unchanged.
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for row in entries:
        english = (row.get('english') or '').strip()
        german = (row.get('german') or '').strip()
        example = (row.get('example') or '').strip()

        # Anything that is not "phonetic-only EN with empty DE", and the
        # very first row, is kept as its own entry.
        if not (result and _is_phonetic_only_text(english) and not german):
            result.append(row)
            continue

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        target['english'] = target_en + ' ' + english if target_en else english

        # An example on the phonetic row belongs to the previous entry too.
        if example:
            target_ex = (target.get('example') or '').strip()
            target['example'] = ' '.join(part for part in (target_ex, example) if part)

        logger.debug(
            f"Merged phonetic row {row.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
|
||||
|
||||
|
||||
def _join_wrapped_text(head: str, tail: str) -> str:
    """Join a wrapped cell fragment *tail* onto the text *head* it continues.

    No separator is inserted when *head* is empty or ends with '-' or '('
    (the fragment continues the same token); otherwise a single space is used.
    """
    glue = '' if not head or head.endswith(('-', '(')) else ' '
    return (head + glue + tail).strip()


def _merge_wrapped_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge rows that are cell-wrap continuations of the previous row.

    In textbook vocabulary tables, columns are often narrow, so text wraps
    within a cell and OCR reports each physical line as a separate row.

    Case 1 - EN is empty but DE and/or example have text, and the previous
    entry has EN text: DE and example are appended to the previous entry.

        Row 2: EN="take part (in)"  DE="teilnehmen (an), mitmachen"  EX="More than 200 singers took"
        Row 3: EN=""                DE="(bei)"                       EX="part in the concert."
        -> EN="take part (in)"  DE="teilnehmen (an), mitmachen (bei)"  EX="..."

    Case 2 - DE is empty, the previous entry has DE text, and EN looks like
    a continuation (starts with '(' or a lowercase letter, fewer than five
    words): EN and example are appended to the previous entry.

    Both cases join text with a single space, except directly after '-' or
    '(' where no separator is inserted. Previously Case 2 also dropped the
    space after a trailing comma, yielding e.g. "break, broke,broken".

    Args:
        entries: Entry dicts with optional 'english', 'german', 'example'
            and 'row_index' keys. Merge targets are modified in place.

    Returns:
        The list of entries with continuation rows folded into their
        predecessor; the input list itself when it has fewer than two items.
    """
    if len(entries) < 2:
        return entries

    merged: List[Dict[str, Any]] = []
    for entry in entries:
        en = (entry.get('english') or '').strip()
        de = (entry.get('german') or '').strip()
        ex = (entry.get('example') or '').strip()

        # The first row can never be a continuation.
        if not merged:
            merged.append(entry)
            continue

        prev = merged[-1]
        prev_en = (prev.get('english') or '').strip()
        prev_de = (prev.get('german') or '').strip()
        prev_ex = (prev.get('example') or '').strip()

        # Case 1: EN is empty -> continuation of previous row
        if not en and (de or ex) and prev_en:
            if de:
                prev['german'] = _join_wrapped_text(prev_de, de)
            if ex:
                sep = ' ' if prev_ex else ''
                prev['example'] = (prev_ex + sep + ex).strip()
            # .get(): 'german' may be absent when only the example was merged.
            logger.debug(
                f"Merged wrapped row {entry.get('row_index')} into previous "
                f"(empty EN): DE={prev.get('german', '')!r}, EX={prev.get('example', '')!r}"
            )
            continue

        # Case 2: DE is empty, EN has text that looks like continuation
        if en and not de and prev_de:
            is_paren = en.startswith('(')
            first_alpha = next((c for c in en if c.isalpha()), '')
            starts_lower = first_alpha.islower()

            # Longer lowercase fragments are not merged here.
            if (is_paren or starts_lower) and len(en.split()) < 5:
                prev['english'] = _join_wrapped_text(prev_en, en)
                if ex:
                    sep2 = ' ' if prev_ex else ''
                    prev['example'] = (prev_ex + sep2 + ex).strip()
                logger.debug(
                    f"Merged wrapped row {entry.get('row_index')} into previous "
                    f"(empty DE): EN={prev['english']!r}"
                )
                continue

        merged.append(entry)

    if len(merged) < len(entries):
        logger.info(
            f"_merge_wrapped_rows: merged {len(entries) - len(merged)} "
            f"continuation rows ({len(entries)} -> {len(merged)})"
        )
    return merged
|
||||
|
||||
|
||||
def _merge_continuation_rows(
    entries: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Merge multi-line vocabulary entries whose EN text wraps to the next row.

    A row is folded into the previous entry when all of the following hold:
      - it has EN text and no DE text,
      - the EN text is not phonetic-only (those rows are kept as they are),
      - its first alphabetic character is lowercase,
      - it has fewer than four words (longer text is not merged),
      - the previous entry's EN does not end with '.', '!' or '?'.

    Example:
        Row 5: EN="to put up"    DE="aufstellen"
        Row 6: EN="with sth."    DE=""
        -> EN="to put up with sth."  DE="aufstellen"

    The previous entry dict is modified in place. Lists with fewer than two
    entries are returned unchanged.
    """
    if len(entries) < 2:
        return entries

    result: List[Dict[str, Any]] = []
    for row in entries:
        english = (row.get('english') or '').strip()
        german = (row.get('german') or '').strip()

        # Only rows with EN text and empty DE (after the first) are candidates.
        if not (result and english and not german):
            result.append(row)
            continue

        # Phonetic-only rows are left untouched by this pass.
        if _is_phonetic_only_text(english):
            result.append(row)
            continue

        first_letter = next((ch for ch in english if ch.isalpha()), '')
        begins_lowercase = first_letter.islower()
        is_brief = len(english.split()) < 4

        target = result[-1]
        target_en = (target.get('english') or '').strip()
        target_is_finished = target_en.endswith(('.', '!', '?'))

        if not begins_lowercase or not is_brief or target_is_finished:
            result.append(row)
            continue

        target['english'] = (target_en + ' ' + english).strip()

        example = (row.get('example') or '').strip()
        if example:
            target_ex = (target.get('example') or '').strip()
            target['example'] = ' '.join(part for part in (target_ex, example) if part)

        logger.debug(
            f"Merged continuation row {row.get('row_index')} "
            f"into previous entry: {target['english']!r}"
        )

    return result
|
||||
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
Streaming variants of cell-grid builders (v2 + legacy).
|
||||
|
||||
Extracted from cv_cell_grid.py. These yield cells one-by-one as OCR'd,
|
||||
useful for progress reporting.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Generator, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion, RowGeometry
|
||||
from cv_ocr_engines import (
|
||||
RAPIDOCR_AVAILABLE,
|
||||
_assign_row_words_to_columns,
|
||||
)
|
||||
from cv_cell_grid_helpers import (
|
||||
_heal_row_gaps,
|
||||
_is_artifact_row,
|
||||
)
|
||||
from cv_cell_grid_build import _ocr_cell_crop
|
||||
from cv_cell_grid_legacy import _ocr_single_cell
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_cell_grid_v2_streaming
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cell_grid_v2_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Streaming variant of build_cell_grid_v2 -- yields each cell as OCR'd.

    Rows are restricted to 'content' rows with words that are not artifact
    rows; columns of ignore/header/footer/margin type are dropped. Each
    remaining (row, column) cell is OCR'd with ``_ocr_cell_crop``. The
    generator yields nothing when no rows or columns remain.

    Engine resolution: the TrOCR / LightOn names are used as given, 'rapid'
    is used when RapidOCR is available, and everything else (including
    'auto') resolves to 'tesseract'.

    Yields:
        (cell_dict, columns_meta, total_cells)
    """
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine == "rapid" and RAPIDOCR_AVAILABLE:
        engine_name = "rapid"
    else:
        if ocr_engine == "rapid":
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "tesseract"

    content_rows = [
        row for row in row_geometries
        if row.row_type == 'content' and row.word_count > 0
    ]
    if not content_rows:
        return

    ignored_types = {'column_ignore', 'header', 'footer', 'margin_top',
                     'margin_bottom', 'margin_left', 'margin_right'}
    relevant_cols = [col for col in column_regions if col.type not in ignored_types]
    if not relevant_cols:
        return

    content_rows = [row for row in content_rows if not _is_artifact_row(row)]
    if not content_rows:
        return

    # Use header/footer boundaries for heal_row_gaps; without them fall back
    # to the extent of the (top-to-bottom sorted) content rows.
    content_rows.sort(key=lambda row: row.y)
    header_bottoms = [row.y + row.height for row in row_geometries if row.row_type == 'header']
    footer_tops = [row.y for row in row_geometries if row.row_type == 'footer']

    top_bound = max(header_bottoms) if header_bottoms else content_rows[0].y
    if footer_tops:
        bottom_bound = min(footer_tops)
    else:
        last_row = content_rows[-1]
        bottom_bound = last_row.y + last_row.height

    _heal_row_gaps(content_rows, top_bound=top_bound, bottom_bound=bottom_bound)

    # Column indices refer to the left-to-right order.
    relevant_cols.sort(key=lambda col: col.x)

    columns_meta = [
        {'index': idx, 'type': col.type, 'x': col.x, 'width': col.width}
        for idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        for col_idx, col in enumerate(relevant_cols):
            yield (
                _ocr_cell_crop(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    engine_name, lang, lang_map,
                ),
                columns_meta,
                total_cells,
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_cell_grid_streaming — legacy streaming variant
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_cell_grid_streaming(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr: Optional[np.ndarray] = None,
) -> Generator[Tuple[Dict[str, Any], List[Dict[str, Any]], int], None, None]:
    """Like build_cell_grid(), but yields each cell as it is OCR'd.

    DEPRECATED: Use build_cell_grid_v2_streaming instead.

    Applies the same row and column filtering as build_cell_grid() and OCRs
    each cell with ``_ocr_single_cell``. Unlike build_cell_grid(), this
    generator performs no batch column re-OCR and does not remove all-empty
    rows afterwards. It yields nothing when no rows or columns remain.

    Yields:
        (cell_dict, columns_meta, total_cells) for each cell.
    """
    rapid_enabled = False
    if ocr_engine in ("trocr-printed", "trocr-handwritten", "lighton"):
        engine_name = ocr_engine
    elif ocr_engine in ("auto", "rapid"):
        if ocr_engine == "auto":
            rapid_enabled = RAPIDOCR_AVAILABLE and img_bgr is not None
        elif RAPIDOCR_AVAILABLE:
            rapid_enabled = True
        else:
            logger.warning("RapidOCR requested but not available, falling back to Tesseract")
        engine_name = "rapid" if rapid_enabled else "tesseract"
    else:
        engine_name = "tesseract"

    content_rows = [row for row in row_geometries if row.row_type == 'content']
    if not content_rows:
        return

    n_content = len(content_rows)
    content_rows = [row for row in content_rows if row.word_count > 0]
    n_phantom = n_content - len(content_rows)
    if n_phantom > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_phantom} phantom rows (word_count=0)")
    if not content_rows:
        return

    ignored_types = {
        'column_ignore', 'header', 'footer',
        'margin_top', 'margin_bottom', 'margin_left', 'margin_right',
    }
    relevant_cols = [col for col in column_regions if col.type not in ignored_types]
    if not relevant_cols:
        return

    n_with_words = len(content_rows)
    content_rows = [row for row in content_rows if not _is_artifact_row(row)]
    n_artifacts = n_with_words - len(content_rows)
    if n_artifacts > 0:
        logger.info(f"build_cell_grid_streaming: skipped {n_artifacts} artifact rows")
    if not content_rows:
        return

    # Row gaps are healed within the vertical extent of the usable columns.
    _heal_row_gaps(
        content_rows,
        top_bound=min(col.y for col in relevant_cols),
        bottom_bound=max(col.y + col.height for col in relevant_cols),
    )

    # Column indices refer to the left-to-right order.
    relevant_cols.sort(key=lambda col: col.x)

    columns_meta = [
        {'index': idx, 'type': col.type, 'x': col.x, 'width': col.width}
        for idx, col in enumerate(relevant_cols)
    ]

    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    total_cells = len(content_rows) * len(relevant_cols)

    for row_idx, row in enumerate(content_rows):
        words_by_col = _assign_row_words_to_columns(row, relevant_cols)
        for col_idx, col in enumerate(relevant_cols):
            yield (
                _ocr_single_cell(
                    row_idx, col_idx, row, col,
                    ocr_img, img_bgr, img_w, img_h,
                    rapid_enabled, engine_name, lang, lang_map,
                    preassigned_words=words_by_col[col_idx],
                ),
                columns_meta,
                total_cells,
            )
|
||||
@@ -0,0 +1,200 @@
|
||||
"""
|
||||
Vocabulary extraction: cells -> vocab entries, and build_word_grid wrapper.
|
||||
|
||||
Extracted from cv_cell_grid.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_attach_example_sentences,
|
||||
_fix_phonetic_brackets,
|
||||
_split_comma_entries,
|
||||
)
|
||||
from cv_cell_grid_legacy import build_cell_grid
|
||||
from cv_cell_grid_merge import (
|
||||
_merge_continuation_rows,
|
||||
_merge_phonetic_continuation_rows,
|
||||
_merge_wrapped_rows,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cells_to_vocab_entries(
    cells: List[Dict[str, Any]],
    columns_meta: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Turn generic grid cells into one vocabulary entry per row.

    Cells are grouped by 'row_index'. For every cell whose 'col_type' is a
    known column type, its text and percent bbox are copied into the
    matching entry fields (e.g. 'column_en' -> 'english' / 'bbox_en').
    The entry also gets the union bbox of all cells of the row, the mean of
    the positive cell confidences, and the 'ocr_engine' of the row's first
    cell. Rows in which no mapped text field is non-empty are omitted.

    Args:
        cells: Cell dicts with 'row_index', 'col_type', 'text',
            'confidence' and 'bbox_pct'.
        columns_meta: Not used by this function.

    Returns:
        Entries ordered by ascending row index.
    """
    # col_type -> (text field, bbox field) of the resulting entry
    field_map = {
        'column_en': ('english', 'bbox_en'),
        'column_de': ('german', 'bbox_de'),
        'column_example': ('example', 'bbox_ex'),
        'page_ref': ('source_page', 'bbox_ref'),
        'column_marker': ('marker', 'bbox_marker'),
        'column_text': ('text', 'bbox_text'),  # generic single-column (box sub-sessions)
    }
    # Field order below fixes the key order of every entry dict.
    text_fields = ('english', 'german', 'example', 'text', 'source_page', 'marker')
    bbox_fields = ('bbox_en', 'bbox_de', 'bbox_ex', 'bbox_ref', 'bbox_marker', 'bbox_text')

    cells_by_row: Dict[int, List[Dict]] = {}
    for cell in cells:
        cells_by_row.setdefault(cell['row_index'], []).append(cell)

    entries: List[Dict[str, Any]] = []
    for row_idx in sorted(cells_by_row):
        row_cells = cells_by_row[row_idx]

        entry: Dict[str, Any] = {'row_index': row_idx}
        entry.update(dict.fromkeys(text_fields, ''))
        entry['confidence'] = 0.0
        entry['bbox'] = None
        entry.update(dict.fromkeys(bbox_fields))
        entry['ocr_engine'] = row_cells[0].get('ocr_engine', '')

        scores = []
        for cell in row_cells:
            mapping = field_map.get(cell['col_type'])
            if mapping is None:
                continue
            text_key, bbox_key = mapping
            entry[text_key] = cell['text']
            entry[bbox_key] = cell['bbox_pct']
            # NOTE(review): confidence is collected for mapped columns only;
            # confirm this nesting against the original indentation.
            if cell['confidence'] > 0:
                scores.append(cell['confidence'])

        # Row-level bbox: union of all cell bboxes of the row.
        boxes = [cell['bbox_pct'] for cell in row_cells if cell.get('bbox_pct')]
        if boxes:
            left = min(box['x'] for box in boxes)
            top = min(box['y'] for box in boxes)
            right = max(box['x'] + box['w'] for box in boxes)
            bottom = max(box['y'] + box['h'] for box in boxes)
            entry['bbox'] = {
                'x': round(left, 2),
                'y': round(top, 2),
                'w': round(right - left, 2),
                'h': round(bottom - top, 2),
            }

        if scores:
            entry['confidence'] = round(sum(scores) / len(scores), 1)
        else:
            entry['confidence'] = 0.0

        # Keep the row only if at least one mapped field has text.
        if any(entry[name] for name in text_fields):
            entries.append(entry)

    return entries
|
||||
|
||||
|
||||
def build_word_grid(
    ocr_img,
    column_regions,
    row_geometries,
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
    ocr_engine: str = "auto",
    img_bgr=None,
    pronunciation: str = "british",
) -> List[Dict[str, Any]]:
    """Vocab-specific wrapper: cell grid + vocab mapping + post-processing.

    Runs build_cell_grid() and, when the grid has an EN or DE column,
    converts the cells into english/german/example entries and applies the
    deterministic post-processing chain (row merging, phonetic bracket
    replacement, comma splitting, example attachment). Character-confusion
    fixes are not applied here; per the inline note they run in
    llm_review_entries_streaming.

    Args:
        ocr_img: Binarized full-page image (for Tesseract).
        column_regions: Classified columns from Step 3.
        row_geometries: Rows from Step 4.
        img_w, img_h: Image dimensions.
        lang: Default Tesseract language.
        ocr_engine: Engine selector forwarded to build_cell_grid().
        img_bgr: BGR color image (required for RapidOCR).
        pronunciation: 'british' or 'american' for IPA lookup.

    Returns:
        An empty list when the grid has no cells; the raw cell dicts when
        neither a 'column_en' nor a 'column_de' column is present; otherwise
        the list of post-processed entry dicts (bbox info in percent).
    """
    cells, columns_meta = build_cell_grid(
        ocr_img, column_regions, row_geometries, img_w, img_h,
        lang=lang, ocr_engine=ocr_engine, img_bgr=img_bgr,
    )
    if not cells:
        return []

    # Without an EN or DE column there is no vocab layout to map onto.
    vocab_types = {'column_en', 'column_de'}
    present_types = {meta['type'] for meta in columns_meta}
    if present_types.isdisjoint(vocab_types):
        logger.info("build_word_grid: no vocab columns -- returning raw cells")
        return cells

    entries = _cells_to_vocab_entries(cells, columns_meta)
    n_raw = len(entries)

    # --- Post-processing pipeline (deterministic, no LLM) ---
    # Row merging, in order: cell-wrap continuations, phonetic-only rows,
    # then lowercase multi-line EN continuations.
    for merge_step in (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    ):
        entries = merge_step(entries)

    # Character confusion (| -> I, 1 -> I, 8 -> B) is now run in
    # llm_review_entries_streaming so changes are visible to the user in Step 6.

    # Replace OCR'd phonetics with dictionary IPA.
    entries = _fix_phonetic_brackets(entries, pronunciation=pronunciation)
    # Split comma-separated word forms (break, broke, broken -> 3 entries).
    entries = _split_comma_entries(entries)
    # Attach example sentences (rows without DE -> examples for preceding entry).
    entries = _attach_example_sentences(entries)

    engine_name = cells[0].get('ocr_engine', 'unknown')
    logger.info(f"build_word_grid: {len(entries)} entries from "
                f"{n_raw} raw -> {len(entries)} after post-processing "
                f"(engine={engine_name})")

    return entries
|
||||
@@ -0,0 +1,471 @@
|
||||
"""
|
||||
Embedded box detection and page zone splitting for the CV vocabulary pipeline.
|
||||
|
||||
Detects boxes (grammar tips, exercises, etc.) that span the page width and
|
||||
interrupt the normal column layout. Splits the page into vertical zones so
|
||||
that column detection can run independently per zone.
|
||||
|
||||
Two-stage algorithm (both run, results merged):
|
||||
1. Morphological line detection — finds bordered boxes via horizontal lines.
|
||||
2. Background shading detection — finds shaded/colored boxes via median-blur
|
||||
background analysis. Works for colored (blue, green) and grayscale
|
||||
(gray shading on B/W scans) boxes.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import DetectedBox, PageZone
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
"detect_boxes",
|
||||
"split_page_into_zones",
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stage 1: Morphological line detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_boxes_by_lines(
    gray: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate bordered boxes from pairs of long horizontal rules.

    Dark pixels are isolated with a fixed threshold, a wide horizontal
    morphological opening keeps only long horizontal strokes, and every run
    of consecutive image rows that still holds enough stroke pixels inside
    the content columns is treated as one border line. Lines are then paired
    greedily, top to bottom, into boxes.

    Args:
        gray: Grayscale image (full page).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds. ``content_y`` is not
            read here; ``content_h`` only caps the accepted box height.

    Returns:
        One DetectedBox per paired top/bottom line. Each box spans the whole
        content width (``x=content_x``, ``width=content_w``) and carries a
        fixed confidence of 0.8.
    """
    page_h = gray.shape[0]

    # Inverse binarisation: pixels darker than 180 become 255, the rest 0.
    _, binary = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

    # A 1px-high opening kernel, at least half the content width wide,
    # erases everything except long horizontal strokes.
    opening_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(50, content_w // 2), 1)
    )
    lines_img = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)

    # Per-row count of surviving stroke pixels inside the content columns.
    content_cols = lines_img[:, content_x:content_x + content_w]
    h_proj = np.sum(content_cols > 0, axis=1)
    line_threshold = content_w * 0.30
    row_is_line = h_proj >= line_threshold

    # Collapse consecutive "line" rows into (y_start, y_end) segments,
    # y_end being exclusive.
    line_segments: List[Tuple[int, int]] = []
    run_start: Optional[int] = None
    for y, on_line in enumerate(row_is_line):
        if on_line:
            if run_start is None:
                run_start = y
        elif run_start is not None:
            line_segments.append((run_start, y))
            run_start = None
    if run_start is not None:
        # A run that reaches the bottom edge of the image.
        line_segments.append((run_start, page_h))

    if len(line_segments) < 2:
        return []

    # A box is a top line plus a later bottom line whose total height is at
    # least 30px and at most 70% of the content height.
    min_box_h = 30
    max_box_h = int(content_h * 0.70)

    boxes: List[DetectedBox] = []
    consumed = set()  # indices of segments already used as a box border
    for top_idx, (top_start, top_end) in enumerate(line_segments):
        if top_idx in consumed:
            continue
        for bot_idx in range(top_idx + 1, len(line_segments)):
            if bot_idx in consumed:
                continue
            bot_start, bot_end = line_segments[bot_idx]
            box_h = bot_end - top_start
            if not (min_box_h <= box_h <= max_box_h):
                continue

            # Border thickness is estimated as the thicker of the two rules.
            thickness = max(top_end - top_start, bot_end - bot_start)
            boxes.append(DetectedBox(
                x=content_x,
                y=top_start,
                width=content_w,
                height=box_h,
                confidence=0.8,
                border_thickness=thickness,
            ))
            consumed.update((top_idx, bot_idx))
            break  # this top line is paired; continue with the next one

    return boxes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stage 2: Background shading detection (color + grayscale)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_boxes_by_shading(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
) -> List[DetectedBox]:
    """Locate boxes that stand out by background shading or colour.

    A heavy median blur wipes out text strokes so that only the background
    remains. Pixels that are clearly darker than the page background, or
    that carry noticeable saturation, form a mask; the outer contours of
    that mask are filtered by size and rectangularity and finally verified
    against the blurred background inside their bounding rectangle.

    Args:
        img_bgr: BGR color image (full page).
        content_x, content_w: Horizontal content bounds. Only ``content_w``
            is read (for the size filters); the mask itself is computed on
            the whole image.
        content_y, content_h: Vertical content bounds. Only ``content_h``
            is read (maximum box height).

    Returns:
        One DetectedBox per accepted region, with ``border_thickness=0`` and
        confidence 0.7 for coloured regions, 0.6 for grey-shaded ones.
    """
    page_h, page_w = img_bgr.shape[:2]

    # --- Heavy median blur removes text strokes, keeps background ---
    blurred = cv2.medianBlur(img_bgr, 31)
    blur_gray = cv2.cvtColor(blurred, cv2.COLOR_BGR2GRAY)
    blur_hsv = cv2.cvtColor(blurred, cv2.COLOR_BGR2HSV)

    # Page background = median grey of the top-left and top-right corners.
    corner = max(20, min(page_h // 10, page_w // 10))
    corner_samples = np.concatenate([
        blur_gray[:corner, :corner].ravel(),
        blur_gray[:corner, -corner:].ravel(),
    ])
    page_bg = float(np.median(corner_samples))

    # Mask 1: noticeably darker than the page background (never below 150).
    shade_thresh = max(page_bg - 30, 150)
    gray_mask = (blur_gray < shade_thresh).astype(np.uint8) * 255
    # Mask 2: noticeable saturation (blue/green/etc. boxes).
    sat_mask = (blur_hsv[:, :, 1] > 20).astype(np.uint8) * 255
    mask = cv2.bitwise_or(gray_mask, sat_mask)

    # Close small gaps first, then open to drop small specks.
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (25, 10)),
    )
    mask = cv2.morphologyEx(
        mask, cv2.MORPH_OPEN,
        cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5)),
    )

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Size thresholds: smaller boxes allowed (e.g. "German leihen" ~30% width)
    min_area = content_w * 30          # equivalent to 30px tall at full width
    min_box_h = 25
    max_box_h = int(content_h * 0.70)
    min_box_w = content_w * 0.25       # boxes can be ~25% of content width

    boxes: List[DetectedBox] = []
    for contour in contours:
        contour_area = cv2.contourArea(contour)
        if contour_area < min_area:
            continue

        bx, by, bw, bh = cv2.boundingRect(contour)

        # Width and height filters.
        if bw < min_box_w or bh < min_box_h or bh > max_box_h:
            continue

        # Rectangularity: the contour must fill at least half of its
        # bounding rectangle.
        rect_area = bw * bh
        if rect_area > 0 and contour_area / rect_area < 0.5:
            continue

        # Verify that the blurred background inside the rectangle really is
        # shaded or coloured.
        roi_gray = blur_gray[by:by + bh, bx:bx + bw]
        roi_hsv = blur_hsv[by:by + bh, bx:bx + bw]
        if roi_gray.size == 0:
            continue

        is_shaded = float(np.median(roi_gray)) < (page_bg - 15)
        is_colored = float(np.median(roi_hsv[:, :, 1])) > 15
        if not (is_shaded or is_colored):
            continue

        boxes.append(DetectedBox(
            x=bx,
            y=by,
            width=bw,
            height=bh,
            confidence=0.7 if is_colored else 0.6,
            border_thickness=0,
        ))

    return boxes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _validate_box(
    box: DetectedBox,
    gray: np.ndarray,
    content_w: int,
    content_h: int,
    median_row_gap: int,
) -> bool:
    """Decide whether a detected box is genuine (not a table-row separator etc.).

    Args:
        box: Candidate box; its ``x``, ``y``, ``width`` and ``height`` are read.
        gray: Grayscale page image used for the ink-density check.
        content_w, content_h: Size of the content area.
        median_row_gap: Median row gap height; values <= 0 disable the
            separator check.

    Returns:
        True when the box passes all geometric checks and contains some ink.
    """
    # Geometry: at least 25% of the content width, 25px .. 70% of the content
    # height, and - when a row gap is known - at least three row gaps tall so
    # that it is not mistaken for a table-row separator.
    too_narrow = box.width < content_w * 0.25
    bad_height = box.height < 25 or box.height > content_h * 0.70
    separator_like = median_row_gap > 0 and box.height < median_row_gap * 3
    if too_narrow or bad_height or separator_like:
        return False

    # Ink density inside the box, clipped to the image bounds.
    img_h, img_w = gray.shape[:2]
    top, bottom = max(0, box.y), min(img_h, box.y + box.height)
    left, right = max(0, box.x), min(img_w, box.x + box.width)
    roi = gray[top:bottom, left:right]
    if roi.size == 0:
        return False

    ink_ratio = np.sum(roi < 128) / roi.size
    # Below 0.2% dark pixels the region is nearly empty -> not a content box.
    return not ink_ratio < 0.002
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API: detect_boxes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _merge_overlapping_boxes(boxes: List[DetectedBox]) -> List[DetectedBox]:
    """Drop duplicate detections of the same box.

    Two boxes count as duplicates when their intersection covers more than
    50% of the smaller box's area. Of such a pair the box with the higher
    confidence survives; on equal confidence the larger one survives.

    Returns:
        The surviving boxes ordered by area, largest first. A list with zero
        or one element is returned unchanged.
    """
    if len(boxes) <= 1:
        return boxes

    # Largest first, so that in every pair (i, j) with i < j box i is the
    # larger one - this is what makes "else drop j" keep the larger box.
    ordered = sorted(boxes, key=lambda b: b.width * b.height, reverse=True)
    count = len(ordered)
    alive = [True] * count

    for i, big in enumerate(ordered):
        if not alive[i]:
            continue
        for j in range(i + 1, count):
            if not alive[j]:
                continue
            small = ordered[j]

            # Extent of the intersection rectangle along both axes.
            overlap_w = (min(big.x + big.width, small.x + small.width)
                         - max(big.x, small.x))
            overlap_h = (min(big.y + big.height, small.y + small.height)
                         - max(big.y, small.y))
            if overlap_w <= 0 or overlap_h <= 0:
                continue  # no overlap

            smaller_area = min(big.width * big.height, small.width * small.height)
            if smaller_area <= 0 or (overlap_w * overlap_h) / smaller_area <= 0.50:
                continue

            if small.confidence > big.confidence:
                # The larger box loses; stop comparing it against others.
                alive[i] = False
                break
            alive[j] = False

    return [box for box, kept in zip(ordered, alive) if kept]
|
||||
|
||||
|
||||
def detect_boxes(
    img_bgr: np.ndarray,
    content_x: int,
    content_w: int,
    content_y: int,
    content_h: int,
    median_row_gap: int = 0,
) -> List[DetectedBox]:
    """Detect embedded boxes on a page image.

    Both detectors (border lines and background shading) always run; their
    candidates are concatenated, de-duplicated and then validated.

    Args:
        img_bgr: BGR color image (full page or cropped).
        content_x, content_w: Horizontal content bounds.
        content_y, content_h: Vertical content bounds.
        median_row_gap: Median row gap height (for filtering out table separators).

    Returns:
        List of validated DetectedBox instances, sorted by y position.
    """
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    bounds = (content_x, content_w, content_y, content_h)

    # Stage 1 works on the grayscale image, stage 2 on the colour image.
    line_boxes = _detect_boxes_by_lines(gray, *bounds)
    shade_boxes = _detect_boxes_by_shading(img_bgr, *bounds)

    logger.debug("BoxDetect: %d line-based, %d shading-based candidates",
                 len(line_boxes), len(shade_boxes))

    # Line-based candidates come first in the combined list.
    merged = _merge_overlapping_boxes(line_boxes + shade_boxes)

    # Keep only plausible boxes, ordered top to bottom.
    validated = sorted(
        (
            box for box in merged
            if _validate_box(box, gray, content_w, content_h, median_row_gap)
        ),
        key=lambda b: b.y,
    )

    if not validated:
        logger.debug("BoxDetect: no boxes detected")
        return validated

    logger.info("BoxDetect: %d box(es) detected (line=%d, shade=%d, merged=%d)",
                len(validated), len(line_boxes), len(shade_boxes), len(merged))
    return validated
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Zone Splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def split_page_into_zones(
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    boxes: List[DetectedBox],
    min_zone_height: int = 40,
) -> List[PageZone]:
    """Split a page into vertical zones based on detected boxes.

    Regions above, between, and below boxes become 'content' zones;
    box regions become 'box' zones.

    Args:
        content_x, content_y, content_w, content_h: Content area bounds.
        boxes: Detected boxes. They are processed in ascending ``y`` order;
            input that is already sorted keeps its order.
        min_zone_height: Minimum height for a content zone to be kept.
            Shorter gaps between boxes produce no zone at all.

    Returns:
        List of PageZone, ordered top to bottom, with consecutive ``index``
        values starting at 0.
    """
    def content_zone(index: int, y: int, height: int) -> PageZone:
        # Content zones always span the full content width.
        return PageZone(
            index=index,
            zone_type='content',
            y=y,
            height=height,
            x=content_x,
            width=content_w,
        )

    if not boxes:
        # Single zone: entire content area
        return [content_zone(0, content_y, content_h)]

    zones: List[PageZone] = []
    cursor_y = content_y  # lowest y already covered by an emitted zone

    for box in sorted(boxes, key=lambda b: b.y):
        # Content zone above this box
        gap_above = box.y - cursor_y
        if gap_above >= min_zone_height:
            zones.append(content_zone(len(zones), cursor_y, gap_above))

        # Box zone
        zones.append(PageZone(
            index=len(zones),
            zone_type='box',
            y=box.y,
            height=box.height,
            x=box.x,
            width=box.width,
            box=box,
        ))

        # Never move the cursor upwards: a box that ends above the bottom of
        # an earlier box (side-by-side or partly overlapping boxes) must not
        # re-open the area that the earlier box already occupies.
        cursor_y = max(cursor_y, box.y + box.height)

    # Content zone below last box
    remaining = content_y + content_h - cursor_y
    if remaining >= min_zone_height:
        zones.append(content_zone(len(zones), cursor_y, remaining))

    logger.info("ZoneSplit: %d zones from %d box(es): %s",
                len(zones), len(boxes), [z.zone_type for z in zones])

    return zones
|
||||
@@ -0,0 +1,339 @@
|
||||
"""
|
||||
Box layout classifier — detects internal layout type of embedded boxes.
|
||||
|
||||
Classifies each box as: flowing | columnar | bullet_list | header_only
|
||||
and provides layout-appropriate grid building.
|
||||
|
||||
Used by the Box-Grid-Review step to rebuild box zones with correct structure.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import statistics
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Bullet / list-item patterns at the start of a line.
# Every alternative requires one whitespace character after the marker, so
# a bare marker with nothing after it (e.g. the lone token "-" or "1.")
# does not match.
_BULLET_RE = re.compile(
    r'^[\-\u2022\u2013\u2014\u25CF\u25CB\u25AA\u25A0•·]\s'  # dash, bullet chars
    r'|^\d{1,2}[.)]\s'  # numbered: "1) " or "1. "
    r'|^[a-z][.)]\s'  # lettered: "a) " or "a. " (lowercase only)
)
|
||||
|
||||
|
||||
def classify_box_layout(
    words: List[Dict],
    box_w: int,
    box_h: int,
) -> str:
    """Classify the internal layout of a detected box.

    Args:
        words: OCR word dicts within the box (with top, left, width, height, text)
        box_w: Box width in pixels
        box_h: Box height in pixels (currently not used by the classifier)

    Returns:
        'header_only' | 'bullet_list' | 'columnar' | 'flowing'
    """
    if not words:
        return "header_only"

    # Group words into lines by y-proximity
    lines = _group_into_lines(words)

    # Header only: very few words or single line
    total_words = sum(len(line) for line in lines)
    if total_words <= 5 or len(lines) <= 1:
        return "header_only"

    # Bullet list: count the lines that start with a list marker.
    bare_markers = ("-", "–", "—", "•", "·", "▪", "▸")
    bullet_count = 0
    for line in lines:
        if not line:
            continue
        first_text = line[0].get("text", "")
        # _BULLET_RE needs whitespace after the marker, so it has to be run
        # against the whole line text: a single word token such as "1." or
        # "a)" contains no whitespace and could never match on its own.
        line_text = " ".join(w.get("text", "") for w in line)
        if _BULLET_RE.match(line_text) or first_text.strip() in bare_markers:
            bullet_count += 1

    # At least two bullet lines, making up at least 40% of all lines.
    if bullet_count >= len(lines) * 0.4 and bullet_count >= 2:
        return "bullet_list"

    # Columnar: check for multiple distinct x-clusters
    if len(lines) >= 3 and _has_column_structure(words, box_w):
        return "columnar"

    # Default: flowing text
    return "flowing"
|
||||
|
||||
|
||||
def _group_into_lines(words: List[Dict]) -> List[List[Dict]]:
    """Cluster word dicts into text lines by their ``top`` coordinate.

    Words are visited in (top, left) order. A word joins the current line
    while its ``top`` is within the tolerance of the *first* word of that
    line; otherwise it opens a new line. The tolerance is half the median
    positive word height (20 is assumed when no height is available), but
    never less than 5.

    Returns:
        Lines from top to bottom, each sorted by ``left``. Empty input
        yields an empty list.
    """
    if not words:
        return []

    ordered = sorted(words, key=lambda w: (w["top"], w["left"]))
    positive_heights = [w["height"] for w in ordered if w.get("height", 0) > 0]
    typical_h = statistics.median(positive_heights) if positive_heights else 20
    y_tolerance = max(typical_h * 0.5, 5)

    buckets: List[List[Dict]] = [[ordered[0]]]
    anchor_top = ordered[0]["top"]  # top of the first word in the open line
    for word in ordered[1:]:
        if abs(word["top"] - anchor_top) > y_tolerance:
            buckets.append([])
            anchor_top = word["top"]
        buckets[-1].append(word)

    return [sorted(bucket, key=lambda w: w["left"]) for bucket in buckets]
|
||||
|
||||
|
||||
def _has_column_structure(words: List[Dict], box_w: int) -> bool:
    """Check if words have multiple distinct left-edge clusters (columns).

    Args:
        words: OCR word dicts (``top``, ``left``, ``height`` are read).
        box_w: Box width in pixels; non-positive widths yield False.

    Returns:
        True when, among the sorted left edges of all non-first words, at
        least one pair of neighbours is more than 15% of ``box_w`` apart.
        Fewer than 3 lines or fewer than 4 such edges yield False.
    """
    if box_w <= 0:
        return False

    lines = _group_into_lines(words)
    if len(lines) < 3:
        return False

    # Left edges of every word except the first of each line (the first
    # word of a line often aligns regardless of columns).
    left_edges = sorted(w["left"] for line in lines for w in line[1:])
    if len(left_edges) < 4:
        return False

    # A column gap is typically > 15% of box width
    column_gap_threshold = box_w * 0.15
    return any(
        nxt - cur > column_gap_threshold
        for cur, nxt in zip(left_edges, left_edges[1:])
    )
|
||||
|
||||
|
||||
def _box_pct(value: float, total: int) -> float:
    """Return ``value`` as a percentage of ``total`` (2 decimals), 0 if ``total`` is falsy."""
    return round(value / total * 100, 2) if total else 0


def _box_text_column(box_x: int, box_w: int, img_w: int) -> Dict[str, Any]:
    """Build the single full-width text column used for non-columnar boxes."""
    return {
        "col_index": 0, "index": 0, "label": "column_text", "col_type": "column_1",
        "x_min_px": box_x, "x_max_px": box_x + box_w,
        "x_min_pct": _box_pct(box_x, img_w),
        "x_max_pct": _box_pct(box_x + box_w, img_w),
        "bold": False,
    }


def build_box_zone_grid(
    zone_words: List[Dict],
    box_x: int,
    box_y: int,
    box_w: int,
    box_h: int,
    zone_index: int,
    img_w: int,
    img_h: int,
    layout_type: Optional[str] = None,
) -> Dict[str, Any]:
    """Build a grid for a box zone with layout-aware processing.

    If layout_type is None (or empty), it is auto-detected with
    ``classify_box_layout``.
    For 'header_only', creates a single cell holding all text.
    For 'flowing' and 'bullet_list', forces a single column; lines that are
    indented more than 15px beyond the least-indented line are merged into
    the preceding row as continuation lines.
    Any other value is treated as 'columnar' and delegated to
    ``grid_editor_helpers._build_zone_grid`` with independent column
    detection.

    Args:
        zone_words: OCR word dicts inside the box (``text``, ``top``,
            ``left``, ``height`` are read).
        box_x, box_y, box_w, box_h: Box rectangle in pixels.
        zone_index: Zone number, used in cell ids (``Z{zone}_R{row}C{col}``).
        img_w, img_h: Page size in pixels, used for percentage coordinates.
        layout_type: Optional layout override.

    Returns:
        Dict with ``columns``, ``rows``, ``cells``, ``header_rows`` plus
        ``box_layout_type`` and ``box_grid_reviewed`` (always False).
    """
    if not zone_words:
        return {
            "columns": [],
            "rows": [],
            "cells": [],
            "header_rows": [],
            "box_layout_type": layout_type or "header_only",
            "box_grid_reviewed": False,
        }

    # Auto-detect layout if not specified
    if not layout_type:
        layout_type = classify_box_layout(zone_words, box_w, box_h)

    logger.info(
        "Box zone %d: layout_type=%s, %d words, %dx%d",
        zone_index, layout_type, len(zone_words), box_w, box_h,
    )

    if layout_type == "header_only":
        # Single cell with all text concatenated in reading order
        reading_order = sorted(zone_words, key=lambda ww: (ww["top"], ww["left"]))
        all_text = " ".join(w.get("text", "") for w in reading_order).strip()
        return {
            "columns": [_box_text_column(box_x, box_w, img_w)],
            "rows": [{"index": 0, "row_index": 0,
                      "y_min": box_y, "y_max": box_y + box_h, "y_center": box_y + box_h / 2,
                      "y_min_px": box_y, "y_max_px": box_y + box_h,
                      "y_min_pct": _box_pct(box_y, img_h),
                      "y_max_pct": _box_pct(box_y + box_h, img_h),
                      "is_header": True}],
            "cells": [{
                "cell_id": f"Z{zone_index}_R0C0",
                "row_index": 0,
                "col_index": 0,
                "col_type": "column_1",
                "text": all_text,
                "word_boxes": zone_words,
            }],
            "header_rows": [0],
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    if layout_type in ("flowing", "bullet_list"):
        # Force single column. Bullet structure is derived from indentation:
        # continuation lines are merged into the item they belong to.
        lines = _group_into_lines(zone_words)

        # --- Detect indentation levels (relative to the box's left edge) ---
        line_indents = [
            (min(w["left"] for w in line_words) - box_x) if line_words else 0
            for line_words in lines
        ]

        # The minimum non-negative indent is the bullet/main level.
        valid_indents = [ind for ind in line_indents if ind >= 0]
        min_indent = min(valid_indents) if valid_indents else 0

        # Lines indented > 15px more than the minimum are continuation
        # lines belonging to the previous item.
        INDENT_THRESHOLD = 15

        # --- Group lines into logical items (bullet + continuations) ---
        # Each item is a list of line indices; the very first line always
        # opens an item, however far it is indented.
        items: List[List[int]] = []
        for li, indent in enumerate(line_indents):
            if items and indent > min_indent + INDENT_THRESHOLD:
                items[-1].append(li)
            else:
                items.append([li])

        logger.info(
            "Box zone %d flowing: %d lines → %d items (indents=%s, min=%d, threshold=%d)",
            zone_index, len(lines), len(items),
            [int(i) for i in line_indents], int(min_indent), INDENT_THRESHOLD,
        )

        # --- Build rows and cells from grouped items ---
        rows = []
        cells = []
        header_rows = []

        for row_idx, item_line_indices in enumerate(items):
            # Collect the words and the non-empty line texts of this item
            item_words = []
            item_texts = []
            for li in item_line_indices:
                item_words.extend(lines[li])
                line_text = " ".join(w.get("text", "") for w in lines[li]).strip()
                if line_text:
                    item_texts.append(line_text)

            if not item_words:
                continue

            y_min = min(w["top"] for w in item_words)
            y_max = max(w["top"] + w["height"] for w in item_words)

            # NOTE(review): rows built here always carry is_header=False,
            # even for a row that ends up in header_rows below - confirm
            # that consumers rely on header_rows rather than this flag.
            rows.append({
                "index": row_idx,
                "row_index": row_idx,
                "y_min": y_min,
                "y_max": y_max,
                "y_center": (y_min + y_max) / 2,
                "y_min_px": y_min,
                "y_max_px": y_max,
                "y_min_pct": _box_pct(y_min, img_h),
                "y_max_pct": _box_pct(y_max, img_h),
                "is_header": False,
            })

            # Join multi-line text with newline for display
            merged_text = "\n".join(item_texts)

            # A multi-line item (other than the first row) whose text does
            # not already start with a list marker gets one prepended.
            first_text = item_texts[0] if item_texts else ""
            if (len(item_line_indices) > 1
                    and not _BULLET_RE.match(first_text)
                    and row_idx > 0):
                merged_text = "• " + merged_text

            cells.append({
                "cell_id": f"Z{zone_index}_R{row_idx}C0",
                "row_index": row_idx,
                "col_index": 0,
                "col_type": "column_1",
                "text": merged_text,
                "word_boxes": item_words,
            })

        # Detect header: with at least two items, the first item counts as
        # a header when its text is short (< 40 chars), all upper-case, or
        # ends with a colon.
        if len(items) >= 2:
            first_text = " ".join(
                " ".join(w.get("text", "") for w in lines[li]).strip()
                for li in items[0]
            )
            if (len(first_text) < 40
                    or first_text.isupper()
                    or first_text.rstrip().endswith(':')):
                header_rows = [0]

        return {
            "columns": [_box_text_column(box_x, box_w, img_w)],
            "rows": rows,
            "cells": cells,
            "header_rows": header_rows,
            "box_layout_type": layout_type,
            "box_grid_reviewed": False,
        }

    # Columnar: use the standard grid builder with independent column
    # detection. Imported here because only this branch needs it.
    from grid_editor_helpers import _build_zone_grid

    result = _build_zone_grid(
        zone_words, box_x, box_y, box_w, box_h,
        zone_index, img_w, img_h,
        global_columns=None,  # detect columns independently
    )

    # Colspan detection is handled generically by _detect_colspan_cells
    # in grid_editor_helpers.py (called inside _build_zone_grid).

    result["box_layout_type"] = layout_type
    result["box_grid_reviewed"] = False
    return result
|
||||
@@ -0,0 +1,312 @@
|
||||
"""
|
||||
Color detection for OCR word boxes.
|
||||
|
||||
Detects the text color of existing OCR words and recovers colored text
|
||||
regions (e.g. red markers, blue headings) that standard OCR may have missed.
|
||||
|
||||
Standard OCR (Tesseract, PaddleOCR) binarises images before processing,
|
||||
destroying all color information. This module adds it back by sampling
|
||||
HSV pixel values at word-box positions and finding colored regions that
|
||||
no word-box covers.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HSV color ranges (OpenCV: H 0-180, S 0-255, V 0-255)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Per colour name: list of (lower, upper) HSV bounds. A colour may need more
# than one range - red has two because its hue wraps around 0/180.
# All ranges require S >= 70 and V >= 50.
# Hues between 85 and 100 are not covered by any entry.
_COLOR_RANGES: Dict[str, List[Tuple[np.ndarray, np.ndarray]]] = {
    "red": [
        (np.array([0, 70, 50]), np.array([10, 255, 255])),
        (np.array([170, 70, 50]), np.array([180, 255, 255])),
    ],
    "orange": [
        (np.array([10, 70, 50]), np.array([25, 255, 255])),
    ],
    "yellow": [
        (np.array([25, 70, 50]), np.array([35, 255, 255])),
    ],
    "green": [
        (np.array([35, 70, 50]), np.array([85, 255, 255])),
    ],
    "blue": [
        (np.array([100, 70, 50]), np.array([130, 255, 255])),
    ],
    "purple": [
        (np.array([130, 70, 50]), np.array([170, 255, 255])),
    ],
}
|
||||
|
||||
# Hex colour string stored on a word box for each colour name.
# "black" is the value detect_word_colors assigns to every word it does not
# classify as coloured.
_COLOR_HEX: Dict[str, str] = {
    "black": "#000000",
    "gray": "#6b7280",
    "red": "#dc2626",
    "orange": "#ea580c",
    "yellow": "#ca8a04",
    "green": "#16a34a",
    "blue": "#2563eb",
    "purple": "#9333ea",
}
|
||||
|
||||
|
||||
def _hue_to_color_name(hue: float) -> str:
    """Translate an OpenCV hue value (0-180) into a colour name.

    Red covers both ends of the scale (below 10 and above 170). The
    remaining hues are bucketed by exclusive upper bounds: orange < 25,
    yellow < 35, green < 85, blue < 130, everything else purple.
    """
    if hue < 10 or hue > 170:
        return "red"

    upper_bounds = (
        (25, "orange"),
        (35, "yellow"),
        (85, "green"),
        (130, "blue"),
    )
    for upper, name in upper_bounds:
        if hue < upper:
            return name
    return "purple"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Color annotation for existing word boxes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def detect_word_colors(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    sat_threshold: int = 55,
    min_sat_ratio: float = 0.25,
) -> None:
    """Annotate each word_box in-place with its detected text color.

    Every dict in ``word_boxes`` receives ``color`` (hex string) and
    ``color_name`` (e.g. 'red', 'black'). Nothing is returned. When
    ``img_bgr`` is None or ``word_boxes`` is empty the call is a no-op.

    Per word:
      1. Crop the word rectangle (clipped to the image); an empty crop is
         marked black.
      2. Text mask = Otsu-thresholded dark pixels OR pixels whose
         saturation exceeds ``sat_threshold``.
      3. For crops larger than 6x6, sample the 2px border ring as
         background. If the background is tinted (median S > 15), discard
         text pixels whose hue is within 20 of the background hue - unless
         that would discard all of them.
      4. The word is coloured only if the median saturation of the
         remaining text pixels reaches ``sat_threshold`` and at least
         ``min_sat_ratio`` of them exceed it. The colour name comes from
         the median hue of those saturated pixels.
      5. Red additionally needs a median saturation of at least 90.
    """
    if img_bgr is None or not word_boxes:
        return

    def _set_black(box: Dict) -> None:
        box["color"] = _COLOR_HEX["black"]
        box["color_name"] = "black"

    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    img_h, img_w = img_bgr.shape[:2]
    n_colored = 0

    for wb in word_boxes:
        left = max(0, int(wb["left"]))
        top = max(0, int(wb["top"]))
        right = min(img_w, int(wb["left"] + wb["width"]))
        bottom = min(img_h, int(wb["top"] + wb["height"]))
        if right <= left or bottom <= top:
            _set_black(wb)
            continue

        crop_hsv = img_hsv[top:bottom, left:right]
        crop_gray = cv2.cvtColor(img_bgr[top:bottom, left:right], cv2.COLOR_BGR2GRAY)
        crop_h, crop_w = crop_hsv.shape[:2]

        # --- Text mask: Otsu (adaptive) + high-saturation pixels ---
        _, dark_mask = cv2.threshold(
            crop_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
        )
        sat_mask = (crop_hsv[:, :, 1] > sat_threshold).astype(np.uint8) * 255
        text_pixels = crop_hsv[cv2.bitwise_or(dark_mask, sat_mask) > 0]

        if len(text_pixels) < 3:
            _set_black(wb)
            continue

        # --- Background subtraction via the 2px border ring of the crop ---
        if crop_h > 6 and crop_w > 6:
            border = 2
            ring = np.vstack([
                crop_hsv[:border, :].reshape(-1, 3),
                crop_hsv[-border:, :].reshape(-1, 3),
                crop_hsv[border:-border, :border].reshape(-1, 3),
                crop_hsv[border:-border, -border:].reshape(-1, 3),
            ])
            bg_hue = float(np.median(ring[:, 0]))
            bg_sat = float(np.median(ring[:, 1]))

            # Tinted background: drop text pixels of similar hue so the
            # background colour is not reported as the text colour.
            if bg_sat > 15:
                raw_diff = np.abs(text_pixels[:, 0].astype(float) - bg_hue)
                # Hue is circular with period 180.
                hue_diff = np.minimum(raw_diff, 180.0 - raw_diff)
                distinct = hue_diff > 20
                if np.any(distinct):
                    text_pixels = text_pixels[distinct]

        if len(text_pixels) < 3:
            _set_black(wb)
            continue

        # --- Classification using MEDIAN (robust to outliers) ---
        saturations = text_pixels[:, 1]
        median_sat = float(np.median(saturations))
        saturated = saturations > sat_threshold
        sat_ratio = int(np.sum(saturated)) / len(text_pixels)

        if median_sat < sat_threshold or sat_ratio < min_sat_ratio:
            _set_black(wb)
            continue

        # Median hue of the saturated pixels only, for a cleaner signal.
        median_hue = float(np.median(text_pixels[saturated][:, 0]))
        name = _hue_to_color_name(median_hue)

        # Red requires higher saturation - scanner artifacts on black text
        # often produce a slight warm tint (hue ~0) with low saturation
        # that would otherwise be misclassified as red.
        if name == "red" and median_sat < 90:
            _set_black(wb)
            continue

        wb["color"] = _COLOR_HEX.get(name, _COLOR_HEX["black"])
        wb["color_name"] = name
        n_colored += 1

    if n_colored:
        logger.info("color annotation: %d / %d words are colored",
                    n_colored, len(word_boxes))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Recover colored text that OCR missed
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def recover_colored_text(
    img_bgr: np.ndarray,
    existing_words: List[Dict],
    min_area: int = 40,
    max_regions: int = 60,
) -> List[Dict]:
    """Find colored text regions not covered by any existing word box.

    Each color in ``_COLOR_RANGES`` is masked in HSV space, pixels inside
    (padded) OCR word boxes are removed, and the remaining blobs that pass
    the size filters are reported as recovered single-character words.

    Args:
        img_bgr: BGR page image. ``None`` or an empty array yields ``[]``.
        existing_words: OCR word dicts with ``left``/``top``/``width``/
            ``height``; missing keys are treated as 0.
        min_area: Minimum contour area (in pixels) for a blob to be kept.
        max_regions: Maximum number of blobs kept *per color* (largest
            contour area first).

    Returns:
        A list of word dicts with ``text`` (from ``_identify_shape``),
        ``left``/``top``/``width``/``height``, ``conf`` (fixed at 45),
        ``color``, ``color_name`` and ``recovered=True``.
    """
    # An empty array would make cv2.cvtColor raise; treat it like "no image".
    if img_bgr is None or img_bgr.size == 0:
        return []

    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    ih, iw = img_bgr.shape[:2]
    max_area = int(ih * iw * 0.005)

    # --- Build occupancy mask from existing words (adaptive padding) ---
    # Pad word boxes generously to prevent colored-pixel artifacts in
    # narrow inter-word gaps from being recovered as false characters.
    heights = [wb["height"] for wb in existing_words if wb.get("height", 0) > 0]
    median_h = int(np.median(heights)) if heights else 20
    pad = max(8, int(median_h * 0.35))

    occupied = np.zeros((ih, iw), dtype=np.uint8)
    for wb in existing_words:
        left = wb.get("left", 0)
        top = wb.get("top", 0)
        x1 = max(0, int(left) - pad)
        y1 = max(0, int(top) - pad)
        # Clamp the end index at 0 as well: a negative slice end would wrap
        # around and mark almost the whole row/column as occupied.
        x2 = max(0, min(iw, int(left + wb.get("width", 0)) + pad))
        y2 = max(0, min(ih, int(top + wb.get("height", 0)) + pad))
        occupied[y1:y2, x1:x2] = 255

    # Loop-invariant pieces: the "free" mask and the morphology kernels.
    free = cv2.bitwise_not(occupied)
    # Tall kernel so the stroke and the dot of a "!" merge into one blob.
    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 8))
    # Small kernel to remove noise specks.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    recovered: List[Dict] = []
    per_color: Dict[str, int] = {}

    for color_name, ranges in _COLOR_RANGES.items():
        # Union of all HSV ranges that define this color.
        mask = np.zeros((ih, iw), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(img_hsv, lower, upper))

        # Remove pixels already covered by existing OCR words.
        mask = cv2.bitwise_and(mask, free)

        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel_close)
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_open)

        contours, _ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
        )

        candidates = []
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area < min_area or area > max_area:
                continue
            bx, by, bw, bh = cv2.boundingRect(cnt)
            if bh < 6:
                continue
            # Reject regions too wide to be single characters.
            if bw > median_h * 4:
                continue
            candidates.append((area, bx, by, bw, bh))

        # Keep largest first, limited count (per color).
        candidates.sort(key=lambda c: c[0], reverse=True)

        for _area, bx, by, bw, bh in candidates[:max_regions]:
            recovered.append({
                "text": _identify_shape(bw, bh),
                "left": bx,
                "top": by,
                "width": bw,
                "height": bh,
                "conf": 45,
                "color": _COLOR_HEX.get(color_name, "#000000"),
                "color_name": color_name,
                "recovered": True,
            })
            per_color[color_name] = per_color.get(color_name, 0) + 1

    if recovered:
        logger.info(
            "color recovery: %d colored regions found (%s)",
            len(recovered),
            ", ".join(f"{c}: {n}" for c, n in sorted(per_color.items())),
        )

    return recovered
|
||||
|
||||
|
||||
def _identify_shape(w: int, h: int) -> str:
    """Guess a single-character marker from a bounding-box size.

    Returns ``"!"`` for tall narrow boxes, ``"•"`` for small roughly
    square boxes and ``"?"`` for everything else.
    """
    # A non-positive height is treated as a square box.
    ratio = 1.0 if not h > 0 else w / h

    tall_and_narrow = ratio < 0.55 and h > 10
    if tall_and_narrow:
        # Likely an exclamation mark.
        return "!"

    small_and_squarish = 0.6 < ratio < 1.5 and max(w, h) < 25
    # Bullet or dot when small and squarish, otherwise unknown.
    return "•" if small_and_squarish else "?"
|
||||
@@ -0,0 +1,413 @@
|
||||
"""
|
||||
PP-DocLayout ONNX Document Layout Detection.
|
||||
|
||||
Uses PP-DocLayout ONNX model to detect document structure regions:
|
||||
table, figure, title, text, list, header, footer, equation, reference, abstract
|
||||
|
||||
Fallback: If ONNX model not available, returns empty list (caller should
|
||||
fall back to OpenCV-based detection in cv_graphic_detect.py).
|
||||
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = [
|
||||
"detect_layout_regions",
|
||||
"is_doclayout_available",
|
||||
"get_doclayout_status",
|
||||
"LayoutRegion",
|
||||
"DOCLAYOUT_CLASSES",
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Class labels (PP-DocLayout default order)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Position in this list is the class index reported by the model.
# NOTE(review): the order must match the label list of the exported ONNX
# model actually deployed — confirm against that model's config.
DOCLAYOUT_CLASSES = [
    "table",
    "figure",
    "title",
    "text",
    "list",
    "header",
    "footer",
    "equation",
    "reference",
    "abstract",
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class LayoutRegion:
    """One layout region in original-image pixel coordinates.

    Attributes:
        x: Left edge of the region.
        y: Top edge of the region.
        width: Region width.
        height: Region height.
        label: Class name (table, figure, title, text, list, etc.).
        confidence: Detection score of the region.
        label_index: Raw class index the label was derived from.
    """

    x: int
    y: int
    width: int
    height: int
    label: str
    confidence: float
    label_index: int
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ONNX model loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Candidate model locations, checked in order; empty entries are ignored
# by the lookup.
_MODEL_SEARCH_PATHS = [
    os.environ.get("DOCLAYOUT_ONNX_PATH", ""),  # explicit override, read at import time
    "/root/.cache/huggingface/onnx/pp-doclayout/model.onnx",  # Docker default cache path
    "models/onnx/pp-doclayout/model.onnx",  # local dev, relative to the working directory
]

# Lazy-loader state shared by the session helpers below.
_onnx_session: Optional[object] = None  # loaded session, or None
_model_path: Optional[str] = None  # path the session was loaded from
_load_attempted: bool = False  # True once a load has been tried
_load_error: Optional[str] = None  # reason the load failed, if it did
|
||||
|
||||
|
||||
def _find_model_path() -> Optional[str]:
    """Return the resolved path of the first existing model file, else None."""
    candidates = (Path(raw) for raw in _MODEL_SEARCH_PATHS if raw)
    hit = next((c for c in candidates if c.is_file()), None)
    if hit is None:
        return None
    return str(hit.resolve())
|
||||
|
||||
|
||||
def _load_onnx_session():
    """Return the shared ONNX Runtime session, loading it on first call.

    Only one load is ever attempted; later calls return the cached result
    (which is ``None`` when the model or onnxruntime is unavailable). The
    failure reason is kept in ``_load_error``.
    """
    global _onnx_session, _model_path, _load_attempted, _load_error

    if not _load_attempted:
        _load_attempted = True

        model_file = _find_model_path()
        if model_file is None:
            _load_error = "ONNX model not found in any search path"
            logger.info("PP-DocLayout: %s", _load_error)
            return None

        try:
            import onnxruntime as ort  # type: ignore[import-untyped]

            options = ort.SessionOptions()
            options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            # CPU only, so the GPU stays free for OCR / LLM.
            _onnx_session = ort.InferenceSession(
                model_file, options, providers=["CPUExecutionProvider"],
            )
            _model_path = model_file
            logger.info("PP-DocLayout: model loaded from %s", model_file)
        except ImportError:
            _load_error = "onnxruntime not installed"
            logger.info("PP-DocLayout: %s", _load_error)
        except Exception as err:
            _load_error = str(err)
            logger.warning("PP-DocLayout: failed to load model from %s: %s", model_file, err)

    return _onnx_session
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def is_doclayout_available() -> bool:
    """Tell whether an ONNX session is available (triggers the lazy load)."""
    session = _load_onnx_session()
    return session is not None
|
||||
|
||||
|
||||
def get_doclayout_status() -> Dict:
    """Report the DocLayout backend state as a plain dict.

    Keys: ``available``, ``model_path``, ``load_error``, ``classes`` and
    ``class_count``.
    """
    # Make sure a load has been attempted before reading the module state.
    _load_onnx_session()

    status: Dict = {}
    status["available"] = _onnx_session is not None
    status["model_path"] = _model_path
    status["load_error"] = _load_error
    status["classes"] = DOCLAYOUT_CLASSES
    status["class_count"] = len(DOCLAYOUT_CLASSES)
    return status
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pre-processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Side length (pixels) of the square, letterboxed model input.
# NOTE(review): 800x800 is assumed here — confirm against the input shape
# of the ONNX model that is actually deployed.
_INPUT_SIZE = 800
|
||||
|
||||
|
||||
def preprocess_image(img_bgr: np.ndarray) -> tuple:
    """Letterbox an image into the square PP-DocLayout input tensor.

    The image is resized to fit inside ``_INPUT_SIZE`` x ``_INPUT_SIZE``
    with its aspect ratio preserved, centred on a gray (114) canvas,
    scaled to ``[0, 1]`` and laid out as a float32 NCHW batch of one.

    Args:
        img_bgr: 3-channel image array of shape ``(H, W, 3)``. The channel
            order is passed through unchanged.

    Returns:
        ``(input_tensor, scale, pad_x, pad_y)`` where ``input_tensor`` has
        shape ``(1, 3, _INPUT_SIZE, _INPUT_SIZE)``, ``scale`` is the single
        resize factor applied to both axes, and ``pad_x``/``pad_y`` are the
        left/top padding in pixels. A model-space coordinate maps back to
        the original image as ``(coord - pad) / scale``.
    """
    import cv2  # local import — cv2 is always available in this service

    orig_h, orig_w = img_bgr.shape[:2]

    # Single scale factor so the image fits within the square input.
    scale = min(_INPUT_SIZE / orig_w, _INPUT_SIZE / orig_h)
    # int() truncation can give 0 for extreme aspect ratios, and cv2.resize
    # rejects an empty target size — keep at least one pixel per axis.
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))

    resized = cv2.resize(img_bgr, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Centre the resized image on a gray (114) square canvas.
    pad_x = (_INPUT_SIZE - new_w) // 2
    pad_y = (_INPUT_SIZE - new_h) // 2
    padded = np.full((_INPUT_SIZE, _INPUT_SIZE, 3), 114, dtype=np.uint8)
    padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized

    # Normalize to [0, 1] float32.
    # NOTE(review): no BGR→RGB swap or mean/std normalization is applied —
    # confirm this matches what the exported model expects.
    blob = padded.astype(np.float32) / 255.0

    # HWC → CHW, then add the batch dimension → (1, 3, S, S).
    blob = blob.transpose(2, 0, 1)
    blob = np.expand_dims(blob, axis=0)

    return blob, scale, pad_x, pad_y
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Non-Maximum Suppression (NMS)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _compute_iou(box_a: np.ndarray, box_b: np.ndarray) -> float:
    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns 0.0 when the boxes do not overlap or the union is not positive.
    """
    ax1, ay1, ax2, ay2 = box_a[0], box_a[1], box_a[2], box_a[3]
    bx1, by1, bx2, by2 = box_b[0], box_b[1], box_b[2], box_b[3]

    # Overlap extent per axis, floored at zero for disjoint boxes.
    overlap_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0.0

    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    if union > 0:
        return inter / union
    return 0.0
|
||||
|
||||
|
||||
def nms(boxes: np.ndarray, scores: np.ndarray, iou_threshold: float = 0.5) -> List[int]:
    """Apply greedy Non-Maximum Suppression.

    Boxes are visited in descending score order; each kept box suppresses
    every remaining box whose IoU with it is at or above ``iou_threshold``.
    Suppression ignores class labels (none are passed in).

    Args:
        boxes: (N, 4) array of [x1, y1, x2, y2].
        scores: (N,) confidence scores.
        iou_threshold: Overlap threshold for suppression.

    Returns:
        List of kept indices into ``boxes``, highest score first.
    """
    boxes = np.asarray(boxes)
    if len(boxes) == 0:
        return []
    scores = np.asarray(scores)

    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    order = np.argsort(scores)[::-1]
    keep: List[int] = []

    while order.size > 0:
        best = order[0]
        keep.append(int(best))
        rest = order[1:]
        if rest.size == 0:
            break

        # IoU of the kept box against all remaining boxes in one shot.
        inter_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        inter = inter_w * inter_h
        union = areas[best] + areas[rest] - inter

        # IoU is defined as 0 for disjoint boxes or a non-positive union.
        iou = np.zeros(rest.shape, dtype=np.float64)
        valid = (inter > 0) & (union > 0)
        iou[valid] = inter[valid] / union[valid]

        order = rest[iou < iou_threshold]

    return keep
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Post-processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _postprocess(
    outputs: list,
    scale: float,
    pad_x: int,
    pad_y: int,
    orig_w: int,
    orig_h: int,
    confidence_threshold: float,
    max_regions: int,
) -> List[LayoutRegion]:
    """Turn raw ONNX outputs into ``LayoutRegion`` objects.

    Two output layouts are accepted: a single tensor whose rows are either
    ``[x1, y1, x2, y2, score, class_id]`` or
    ``[x1, y1, x2, y2, obj_conf, cls0_conf, ...]``, or three tensors
    ``(boxes, scores, class_ids)``. Anything else is logged and yields
    ``[]``. Detections are filtered by confidence, de-duplicated with NMS
    (IoU 0.5), mapped back to original-image coordinates, and boxes smaller
    than 5 px on either side are dropped.

    Returns:
        Up to ``max_regions`` regions, highest confidence first.
    """
    tensor_count = len(outputs)

    # --- Unpack boxes / scores / classes from whichever layout we got ---
    if tensor_count == 1:
        det = np.squeeze(outputs[0])
        if det.ndim == 1:
            # A single detection collapses to 1-D; restore the row axis.
            det = det.reshape(1, -1)
        if det.shape[0] == 0:
            return []

        row_len = det.shape[1]
        if row_len < 6:
            logger.warning("PP-DocLayout: unexpected output shape %s", det.shape)
            return []

        all_boxes = det[:, :4]
        if row_len == 6:
            all_scores = det[:, 4]
            all_classes = det[:, 5].astype(int)
        else:
            # Objectness times the best per-class confidence.
            per_class = det[:, 5:]
            all_classes = np.argmax(per_class, axis=1)
            all_scores = det[:, 4] * np.max(per_class, axis=1)

    elif tensor_count == 3:
        all_boxes = np.squeeze(outputs[0])
        all_scores = np.squeeze(outputs[1])
        all_classes = np.squeeze(outputs[2]).astype(int)
        if all_boxes.ndim == 1:
            # Single detection: squeeze removed the row axis everywhere.
            all_boxes = all_boxes.reshape(1, 4)
            all_scores = np.array([all_scores])
            all_classes = np.array([all_classes])

    else:
        logger.warning("PP-DocLayout: unexpected %d output tensors", len(outputs))
        return []

    # --- Confidence filter ---
    confident = all_scores >= confidence_threshold
    boxes = all_boxes[confident]
    scores = all_scores[confident]
    classes = all_classes[confident]
    if len(boxes) == 0:
        return []

    # --- NMS ---
    survivors = nms(boxes, scores, iou_threshold=0.5)
    boxes = boxes[survivors]
    scores = scores[survivors]
    classes = classes[survivors]

    def to_source(value, pad, limit):
        """Undo letterbox padding/scale and clamp into [0, limit]."""
        return max(0, min((value - pad) / scale, limit))

    # --- Map boxes back to original image coordinates ---
    regions: List[LayoutRegion] = []
    class_total = len(DOCLAYOUT_CLASSES)
    for i, box in enumerate(boxes):
        bx1, by1, bx2, by2 = box
        x1 = to_source(bx1, pad_x, orig_w)
        y1 = to_source(by1, pad_y, orig_h)
        x2 = to_source(bx2, pad_x, orig_w)
        y2 = to_source(by2, pad_y, orig_h)

        w = int(round(x2 - x1))
        h = int(round(y2 - y1))
        if w < 5 or h < 5:
            continue

        cls_idx = int(classes[i])
        if 0 <= cls_idx < class_total:
            label = DOCLAYOUT_CLASSES[cls_idx]
        else:
            label = f"class_{cls_idx}"

        regions.append(LayoutRegion(
            x=int(round(x1)),
            y=int(round(y1)),
            width=w,
            height=h,
            label=label,
            confidence=round(float(scores[i]), 4),
            label_index=cls_idx,
        ))

    # Highest confidence first, then cap the count.
    regions.sort(key=lambda r: r.confidence, reverse=True)
    return regions[:max_regions]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main detection function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def detect_layout_regions(
    img_bgr: np.ndarray,
    confidence_threshold: float = 0.5,
    max_regions: int = 50,
) -> List[LayoutRegion]:
    """Run the PP-DocLayout ONNX model on an image.

    Args:
        img_bgr: BGR color image (OpenCV format).
        confidence_threshold: Minimum confidence to keep a detection.
        max_regions: Maximum number of regions to return.

    Returns:
        Regions sorted by confidence descending. An empty list when the
        model is unavailable, the image is ``None``/empty, or inference
        fails.
    """
    session = _load_onnx_session()
    if session is None:
        return []
    if img_bgr is None or img_bgr.size == 0:
        return []

    orig_h, orig_w = img_bgr.shape[:2]
    input_tensor, scale, pad_x, pad_y = preprocess_image(img_bgr)

    # Inference failures are logged and reported as "nothing detected".
    try:
        input_name = session.get_inputs()[0].name
        outputs = session.run(None, {input_name: input_tensor})
    except Exception as exc:
        logger.warning("PP-DocLayout inference failed: %s", exc)
        return []

    regions = _postprocess(
        outputs,
        scale=scale,
        pad_x=pad_x,
        pad_y=pad_y,
        orig_w=orig_w,
        orig_h=orig_h,
        confidence_threshold=confidence_threshold,
        max_regions=max_regions,
    )

    if not regions:
        logger.debug("PP-DocLayout: no regions above threshold %.2f", confidence_threshold)
        return regions

    # Per-label tally for the summary log line.
    tally: Dict[str, int] = {}
    for region in regions:
        tally[region.label] = tally.get(region.label, 0) + 1
    summary = ", ".join(f"{k}: {v}" for k, v in sorted(tally.items()))
    logger.info("PP-DocLayout: %d regions (%s)", len(regions), summary)

    return regions
|
||||
@@ -0,0 +1,422 @@
|
||||
"""
|
||||
Graphical element detection for OCR pages.
|
||||
|
||||
Region-based approach:
|
||||
1. Build a color mask (saturation channel — black text is invisible).
|
||||
2. Dilate heavily to merge nearby colored pixels into regions.
|
||||
3. For each region, check overlap with OCR word boxes:
|
||||
- High word overlap → colored text (skip)
|
||||
- Low word overlap → colored graphic / image (keep)
|
||||
4. Separately detect large black-ink illustrations via ink mask.
|
||||
|
||||
Boxes and text colors are handled by cv_box_detect / cv_color_detect.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
__all__ = ["detect_graphic_elements", "GraphicElement"]
|
||||
|
||||
|
||||
@dataclass
class GraphicElement:
    """A non-text graphical element found on a page.

    Attributes:
        x: Left edge of the bounding box.
        y: Top edge of the bounding box.
        width: Bounding-box width.
        height: Bounding-box height.
        area: Size measure used for ranking elements.
        shape: Element kind (image, illustration).
        color_name: Dominant color name, or 'black'.
        color_hex: Hex color string for ``color_name``.
        confidence: Detection confidence.
        contour: Optional contour data; excluded from ``repr``.
    """

    x: int
    y: int
    width: int
    height: int
    area: int
    shape: str
    color_name: str
    color_hex: str
    confidence: float
    contour: Any = field(default=None, repr=False)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Color helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Display hex value for each color name this module can report.
_COLOR_HEX = dict(
    black="#000000",
    gray="#6b7280",
    red="#dc2626",
    orange="#ea580c",
    yellow="#ca8a04",
    green="#16a34a",
    blue="#2563eb",
    purple="#9333ea",
)
|
||||
|
||||
|
||||
def _dominant_color(hsv_roi: np.ndarray, sat_threshold: int = 40) -> tuple:
    """Classify an HSV pixel region as ``(color_name, color_hex)``.

    The region counts as black when it is empty, when fewer than 15% of
    its pixels exceed ``sat_threshold`` in saturation, or when fewer than
    three such pixels exist. Otherwise the median hue of the saturated
    pixels selects the color name.
    """
    black = ("black", _COLOR_HEX["black"])
    if hsv_roi.size == 0:
        return black

    flat = hsv_roi.reshape(-1, 3)
    vivid = flat[:, 1] > sat_threshold
    total = len(flat)
    vivid_share = np.sum(vivid) / total if total > 0 else 0
    if vivid_share < 0.15:
        return black

    vivid_px = flat[vivid]
    if len(vivid_px) < 3:
        return black

    hue = float(np.median(vivid_px[:, 0]))

    if hue < 10 or hue > 170:
        # Red sits at both ends of the hue scale.
        name = "red"
    else:
        # First upper bound the hue falls under wins; purple otherwise.
        name = "purple"
        for upper, label in ((25, "orange"), (35, "yellow"), (85, "green"), (130, "blue")):
            if hue < upper:
                name = label
                break

    return name, _COLOR_HEX.get(name, _COLOR_HEX["black"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _gd_shape_summary(elements: List[GraphicElement]) -> str:
    """Format per-shape counts as ``"shape: n, ..."`` (sorted) for log lines."""
    counts: Dict[str, int] = {}
    for g in elements:
        counts[g.shape] = counts.get(g.shape, 0) + 1
    return ", ".join(f"{s}: {c}" for s, c in sorted(counts.items()))


def _gd_elements_from_doclayout(img_bgr: np.ndarray, max_elements: int) -> List[GraphicElement]:
    """Convert PP-DocLayout regions to GraphicElements (``[]`` if none/unavailable)."""
    from cv_doclayout_detect import detect_layout_regions, is_doclayout_available

    if not is_doclayout_available():
        return []
    regions = detect_layout_regions(img_bgr)
    if not regions:
        return []

    # label -> (shape, color_name, color_hex); other labels keep their own
    # name as the shape and are shown in gray.
    # NOTE(review): every returned label is converted, not only figure/table —
    # confirm callers expect non-graphic labels here.
    label_style = {
        "figure": ("image", "green", _COLOR_HEX.get("green", "#16a34a")),
        "table": ("image", "blue", _COLOR_HEX.get("blue", "#2563eb")),
    }
    gray_hex = _COLOR_HEX.get("gray", "#6b7280")

    converted: List[GraphicElement] = []
    for r in regions:
        shape, color_name, color_hex = label_style.get(
            r.label, (r.label, "gray", gray_hex),
        )
        converted.append(GraphicElement(
            x=r.x,
            y=r.y,
            width=r.width,
            height=r.height,
            area=r.width * r.height,
            shape=shape,
            color_name=color_name,
            color_hex=color_hex,
            confidence=r.confidence,
            contour=None,
        ))
    converted.sort(key=lambda g: g.area, reverse=True)
    return converted[:max_elements]


def _gd_box_mask(h: int, w: int, word_boxes: List[Dict], pad: int = 0) -> np.ndarray:
    """Rasterize word boxes (grown by ``pad`` px) into a 0/255 uint8 mask."""
    mask = np.zeros((h, w), dtype=np.uint8)
    for wb in word_boxes:
        left = wb.get("left", 0)
        top = wb.get("top", 0)
        x1 = max(0, int(left) - pad)
        y1 = max(0, int(top) - pad)
        # Clamp the end index at 0 too: a negative slice end would wrap
        # around and fill almost the whole row/column.
        x2 = max(0, min(w, int(left + wb.get("width", 0)) + pad))
        y2 = max(0, min(h, int(top + wb.get("height", 0)) + pad))
        mask[y1:y2, x1:x2] = 255
    return mask


def _gd_colored_regions(
    hsv: np.ndarray,
    color_pixels: np.ndarray,
    word_mask: np.ndarray,
    word_boxes: List[Dict],
) -> List[GraphicElement]:
    """PASS 1: find colored image regions that are not colored text."""
    h, w = hsv.shape[:2]

    # Word centroids are the same for every region — compute them once.
    centroids = [
        (
            int(wb.get("left", 0) + wb.get("width", 0) / 2),
            int(wb.get("top", 0) + wb.get("height", 0) / 2),
        )
        for wb in word_boxes
    ]

    # Heavy dilation to merge nearby colored elements into regions.
    # A 25x25 kernel merges elements within ~12px of each other.
    kernel_dilate = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (25, 25))
    region_mask = cv2.dilate(color_pixels, kernel_dilate, iterations=1)

    contours_regions, _ = cv2.findContours(
        region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS1: %d color regions after dilation", len(contours_regions))

    accepted: List[GraphicElement] = []
    for cnt in contours_regions:
        bx, by, bw, bh = cv2.boundingRect(cnt)

        # Skip tiny regions.
        if bw < 15 or bh < 15:
            continue

        # Skip page-spanning regions.
        if bw > w * 0.6 or bh > h * 0.6:
            logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
            continue

        bbox_area = bw * bh  # >= 225 here, so the divisions below are safe

        # Share of the bounding box covered by OCR word boxes.
        roi_words = word_mask[by:by + bh, bx:bx + bw]
        word_overlap = int(np.sum(roi_words > 0)) / bbox_area

        # Number of OCR word centroids inside the region. Colored text that
        # OCR detected has several; real images have 0-1 spurious ones.
        word_centroid_count = sum(
            1 for cx, cy in centroids
            if bx <= cx <= bx + bw and by <= cy <= by + bh
        )

        # Actual colored pixels (pre-dilation) and their density.
        roi_color = color_pixels[by:by + bh, bx:bx + bw]
        color_pixel_count = int(np.sum(roi_color > 0))
        density = color_pixel_count / bbox_area

        # --- Skip heuristics for colored TEXT (not images) ---

        # (a) High word-box pixel overlap → clearly text.
        if word_overlap > 0.40:
            logger.info(
                "GraphicDetect PASS1 skip text-overlap (%d,%d) %dx%d "
                "overlap=%.0f%% centroids=%d",
                bx, by, bw, bh, word_overlap * 100, word_centroid_count,
            )
            continue

        # (b) Multiple OCR words detected inside → colored text
        #     (images rarely produce 2+ confident word detections).
        if word_centroid_count >= 2:
            logger.info(
                "GraphicDetect PASS1 skip multi-word (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%% density=%.0f%%",
                bx, by, bw, bh, word_centroid_count,
                word_overlap * 100, density * 100,
            )
            continue

        # (c) Even 1 word + some pixel overlap → likely text.
        if word_centroid_count >= 1 and word_overlap > 0.10:
            logger.info(
                "GraphicDetect PASS1 skip word+overlap (%d,%d) %dx%d "
                "centroids=%d overlap=%.0f%%",
                bx, by, bw, bh, word_centroid_count, word_overlap * 100,
            )
            continue

        # Need a minimum number of colored pixels (not just dilated area).
        if color_pixel_count < 200:
            continue

        # (d) Very low density → thin strokes, almost certainly text.
        #     Large regions (photos/illustrations) can have low color density
        #     because most pixels are grayscale ink, so regions bigger than
        #     100x80 px get a lower threshold.
        min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
        if density < min_density:
            logger.info(
                "GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
                "density=%.0f%% (min=%.0f%%, likely colored text)",
                bx, by, bw, bh, density * 100, min_density * 100,
            )
            continue

        # (e) Moderate density + small height → colored text line.
        if density < 0.35 and bh < h * 0.05:
            logger.info(
                "GraphicDetect PASS1 skip text-height (%d,%d) %dx%d "
                "density=%.0f%% height=%.1f%%",
                bx, by, bw, bh, density * 100, 100.0 * bh / h,
            )
            continue

        # Dominant color from the actual colored pixels only.
        roi_hsv = hsv[by:by + bh, bx:bx + bw]
        color_px_mask = roi_color > 0
        if np.sum(color_px_mask) > 0:
            color_name, color_hex = _dominant_color(roi_hsv[color_px_mask])
        else:
            color_name, color_hex = "black", _COLOR_HEX["black"]

        # Confidence grows with color density, capped at 0.95.
        conf = min(0.95, 0.5 + density * 0.5)

        logger.debug("GraphicDetect PASS1 accept (%d,%d) %dx%d px=%d density=%.0f%% overlap=%.0f%% %s",
                     bx, by, bw, bh, color_pixel_count, density * 100, word_overlap * 100, color_name)
        accepted.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=color_pixel_count,
            shape="image",
            color_name=color_name, color_hex=color_hex,
            confidence=round(conf, 2), contour=cnt,
        ))

    return accepted


def _gd_ink_illustrations(
    img_bgr: np.ndarray,
    color_pixels: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]],
) -> List[GraphicElement]:
    """PASS 2: find large dark-ink blobs outside words, boxes and colored pixels."""
    h, w = img_bgr.shape[:2]

    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, dark_mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Exclude words (padded by 5 px) ...
    exclusion = _gd_box_mask(h, w, word_boxes, pad=5)

    # ... and the interior of detected boxes (inset so the border ink stays).
    inset = 8
    for box in detected_boxes or []:
        bbx = int(box.get("x", 0))
        bby = int(box.get("y", 0))
        bbw = int(box.get("w", box.get("width", 0)))
        bbh = int(box.get("h", box.get("height", 0)))
        x1 = max(0, bbx + inset)
        y1 = max(0, bby + inset)
        x2 = min(w, bbx + bbw - inset)
        y2 = min(h, bby + bbh - inset)
        if x2 > x1 and y2 > y1:
            exclusion[y1:y2, x1:x2] = 255

    ink_only = cv2.bitwise_and(dark_mask, cv2.bitwise_not(exclusion))
    ink_only = cv2.bitwise_and(ink_only, cv2.bitwise_not(color_pixels))

    contours_ink, _ = cv2.findContours(
        ink_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )
    logger.debug("GraphicDetect PASS2 ink: %d contours", len(contours_ink))

    accepted: List[GraphicElement] = []
    for cnt in contours_ink:
        area = cv2.contourArea(cnt)
        bx, by, bw, bh = cv2.boundingRect(cnt)

        if area < 5000 or min(bw, bh) < 40:
            continue
        if bw > w * 0.8 or bh > h * 0.8:
            continue

        logger.debug("GraphicDetect PASS2 accept (%d,%d) %dx%d area=%d",
                     bx, by, bw, bh, int(area))
        accepted.append(GraphicElement(
            x=bx, y=by, width=bw, height=bh,
            area=int(area), shape="illustration",
            color_name="black", color_hex="#000000",
            confidence=0.5, contour=cnt,
        ))

    return accepted


def _gd_drop_overlapping(candidates: List[GraphicElement]) -> List[GraphicElement]:
    """Keep candidates in order, dropping any that overlap a kept one by >50% of the smaller box."""
    kept: List[GraphicElement] = []
    for c in candidates:
        duplicate = False
        for f in kept:
            ix1 = max(c.x, f.x)
            iy1 = max(c.y, f.y)
            ix2 = min(c.x + c.width, f.x + f.width)
            iy2 = min(c.y + c.height, f.y + f.height)
            if ix2 > ix1 and iy2 > iy1:
                inter = (ix2 - ix1) * (iy2 - iy1)
                smaller = min(c.width * c.height, f.width * f.height)
                if smaller > 0 and inter / smaller > 0.5:
                    duplicate = True
                    break
        if not duplicate:
            kept.append(c)
    return kept


def detect_graphic_elements(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    detected_boxes: Optional[List[Dict]] = None,
    max_elements: int = 50,
) -> List[GraphicElement]:
    """Find non-text graphical regions on the page.

    When the ``GRAPHIC_DETECT_BACKEND`` environment variable is
    ``"doclayout"`` or ``"auto"`` (the default), PP-DocLayout is tried
    first; if it is unavailable, returns nothing, or raises, detection
    falls back to the OpenCV pipeline:

    1. Colored regions: dilate a saturation mask into regions and use word
       overlap / density heuristics to separate graphics from colored text.
    2. Large black-ink illustrations outside words and detected boxes.

    Args:
        img_bgr: BGR color image. ``None`` or an empty array yields ``[]``.
        word_boxes: List of OCR word dicts with left/top/width/height.
        detected_boxes: Optional list of detected box dicts (x/y/w/h).
        max_elements: Maximum number of elements to return.

    Returns:
        List of GraphicElement, sorted by area descending.
    """
    # An empty array would make cv2.cvtColor raise; treat it like "no image".
    if img_bgr is None or img_bgr.size == 0:
        return []

    # ------------------------------------------------------------------
    # Try PP-DocLayout ONNX first if available
    # ------------------------------------------------------------------
    import os
    backend = os.environ.get("GRAPHIC_DETECT_BACKEND", "auto")
    if backend in ("doclayout", "auto"):
        # Deliberately broad: any failure here just means "use OpenCV".
        try:
            layout_result = _gd_elements_from_doclayout(img_bgr, max_elements)
            if layout_result:
                logger.info(
                    "GraphicDetect (PP-DocLayout): %d elements (%s)",
                    len(layout_result),
                    _gd_shape_summary(layout_result),
                )
                return layout_result
        except Exception as e:
            logger.warning("PP-DocLayout failed, falling back to OpenCV: %s", e)

    # ------------------------------------------------------------------
    # OpenCV fallback (original logic)
    # ------------------------------------------------------------------
    h, w = img_bgr.shape[:2]

    logger.debug("GraphicDetect: image %dx%d, %d word_boxes, %d detected_boxes",
                 w, h, len(word_boxes), len(detected_boxes or []))

    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # Word mask (for overlap checking).
    word_mask = _gd_box_mask(h, w, word_boxes)

    # Color mask: saturated, not-too-bright pixels
    # (black text has sat ≈ 0 → invisible).
    sat_mask = (hsv[:, :, 1] > 40).astype(np.uint8) * 255
    val_mask = (hsv[:, :, 2] < 240).astype(np.uint8) * 255
    color_pixels = cv2.bitwise_and(sat_mask, val_mask)

    # Remove tiny speckle.
    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    color_pixels = cv2.morphologyEx(color_pixels, cv2.MORPH_OPEN, kernel_open)

    candidates: List[GraphicElement] = []
    candidates.extend(_gd_colored_regions(hsv, color_pixels, word_mask, word_boxes))
    candidates.extend(_gd_ink_illustrations(img_bgr, color_pixels, word_boxes, detected_boxes))

    # Largest first, then drop near-duplicates and cap the count.
    candidates.sort(key=lambda g: g.area, reverse=True)
    result = _gd_drop_overlapping(candidates)[:max_elements]

    if result:
        logger.info(
            "GraphicDetect: %d elements found (%s)",
            len(result),
            _gd_shape_summary(result),
        )
    else:
        logger.info("GraphicDetect: no graphic elements found")

    return result
|
||||
@@ -0,0 +1,231 @@
|
||||
"""
|
||||
Syllable Core — hyphenator init, word validation, pipe autocorrect.
|
||||
|
||||
Extracted from cv_syllable_detect.py for modularity.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# IPA/phonetic characters -- skip cells containing these.
# The character class matches a literal "[" or "]" as well as the stress
# marks (U+02C8, U+02CC), the length mark (U+02D0) and a set of IPA letters.
# Callers in this code base remove complete "[...]" groups before searching,
# so for them only unbalanced brackets or phonetic symbols outside brackets
# produce a match.
_IPA_RE = re.compile(r'[\[\]\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u00e6\u0254\u0259\u025b\u025c\u026a\u028a\u028c]')
|
||||
|
||||
# Common German words that should NOT be merged with adjacent tokens.
# All entries are lower-case; the merge helpers lower-case a fragment's
# letters before testing membership.
_STOP_WORDS = frozenset([
    # Articles
    'der', 'die', 'das', 'dem', 'den', 'des',
    'ein', 'eine', 'einem', 'einen', 'einer',
    # Pronouns
    'du', 'er', 'es', 'sie', 'wir', 'ihr', 'ich', 'man', 'sich',
    'dich', 'dir', 'mich', 'mir', 'uns', 'euch', 'ihm', 'ihn',
    # Prepositions
    'mit', 'von', 'zu', 'f\u00fcr', 'auf', 'in', 'an', 'um', 'am', 'im',
    'aus', 'bei', 'nach', 'vor', 'bis', 'durch', '\u00fcber', 'unter',
    'zwischen', 'ohne', 'gegen',
    # Conjunctions
    'und', 'oder', 'als', 'wie', 'wenn', 'dass', 'weil', 'aber',
    # Adverbs
    'auch', 'noch', 'nur', 'schon', 'sehr', 'nicht',
    # Verbs
    'ist', 'hat', 'wird', 'kann', 'soll', 'muss', 'darf',
    'sein', 'haben',
    # Other
    'kein', 'keine', 'keinem', 'keinen', 'keiner',
])
|
||||
|
||||
# Cached hyphenators: None until _get_hyphenators() has imported pyphen
# successfully, then reused by every later call.
_hyph_de = None
_hyph_en = None

# Cached spellchecker (for autocorrect_pipe_artifacts): None until
# _get_spellchecker() has created the German SpellChecker.
_spell_de = None
|
||||
|
||||
|
||||
def _get_hyphenators():
    """Return the shared ``(German, English)`` pyphen hyphenators.

    The two ``pyphen.Pyphen`` objects (``de_DE`` and ``en_US``) are created
    on the first successful call and stored in the module-level
    ``_hyph_de`` / ``_hyph_en`` so later calls reuse them.

    Returns:
        ``(hyph_de, hyph_en)``, or ``(None, None)`` when ``pyphen`` cannot
        be imported.  Nothing is cached in that case, so the import is
        attempted again on the next call.
    """
    global _hyph_de, _hyph_en
    if _hyph_de is None:
        try:
            import pyphen
        except ImportError:
            return None, None
        _hyph_de = pyphen.Pyphen(lang='de_DE')
        _hyph_en = pyphen.Pyphen(lang='en_US')
    return _hyph_de, _hyph_en
|
||||
|
||||
|
||||
def _get_spellchecker():
    """Return the shared German ``SpellChecker``, creating it on first use.

    The instance is stored in the module-level ``_spell_de`` and reused by
    later calls.

    Returns:
        The cached ``SpellChecker(language='de')``, or ``None`` when the
        ``spellchecker`` package cannot be imported (nothing is cached in
        that case, so the import is retried on the next call).
    """
    global _spell_de
    if _spell_de is None:
        try:
            from spellchecker import SpellChecker
        except ImportError:
            return None
        _spell_de = SpellChecker(language='de')
    return _spell_de
|
||||
|
||||
|
||||
def _is_known_word(word: str, hyph_de, hyph_en) -> bool:
    """Tell whether either hyphenator finds a break point in *word*.

    Args:
        word: Word to test; anything shorter than 2 characters is rejected.
        hyph_de: German hyphenator, queried first.
        hyph_en: English hyphenator, queried only if the German one inserts
            no divider.

    Returns:
        True when ``inserted(word, hyphen='|')`` of one of the hyphenators
        contains a ``|``, otherwise False.
    """
    if len(word) < 2:
        return False
    for hyphenator in (hyph_de, hyph_en):
        if '|' in hyphenator.inserted(word, hyphen='|'):
            return True
    return False
|
||||
|
||||
|
||||
def _is_real_word(word: str) -> bool:
    """Tell whether the German spellchecker contains *word*.

    The word is lower-cased before the membership test, so the check is
    case-insensitive.  Returns False when no spellchecker is available.
    """
    checker = _get_spellchecker()
    return checker is not None and word.lower() in checker
|
||||
|
||||
|
||||
def _hyphenate_word(word: str, hyph_de, hyph_en) -> Optional[str]:
    """Hyphenate *word* with the German hyphenator, then the English one.

    Args:
        word: Word to split.
        hyph_de: German hyphenator, tried first.
        hyph_en: English hyphenator, tried when the German one inserts no
            divider.

    Returns:
        The word with ``|`` between syllables from the first hyphenator
        that inserts at least one divider, or ``None`` if neither does.
    """
    for hyphenator in (hyph_de, hyph_en):
        candidate = hyphenator.inserted(word, hyphen='|')
        if '|' in candidate:
            return candidate
    return None
|
||||
|
||||
|
||||
def _autocorrect_piped_word(word_with_pipes: str) -> Optional[str]:
    """Repair a single word that carries OCR pipe artifacts.

    Printed syllable dividers on dictionary pages are read by OCR as ``|``
    and frequently also as a spurious thin character next to it.  The
    repair is attempted in this order:

    1. Remove every ``|``.  Results shorter than 3 characters are returned
       as they are (too short to validate); results the spellchecker knows
       are returned unchanged.
    2. Delete one pipe-like character (``l``, ``I``, ``1``, ``i``, ``t``)
       at a time, left to right, and return the first variant of at least
       3 characters that the spellchecker knows.
    3. Ask the spellchecker for a correction of the lower-cased word; if it
       proposes a different word, return it with the first letter
       upper-cased when the input started with an upper-case letter.

    Returns:
        The repaired word, or ``None`` when no repair was found.
    """
    bare = word_with_pipes.replace('|', '')
    if len(bare) < 3:
        return bare  # too short to validate

    # Step 1: the pipe-free word may already be valid.
    if _is_real_word(bare):
        return bare

    # Step 2: drop one stroke-like character at a time.
    pipe_like = frozenset('lI1it')
    for pos, char in enumerate(bare):
        if char in pipe_like:
            shortened = bare[:pos] + bare[pos + 1:]
            if len(shortened) >= 3 and _is_real_word(shortened):
                return shortened

    # Step 3: let the spellchecker suggest a correction.
    checker = _get_spellchecker()
    if checker is None:
        return None  # could not fix
    lowered = bare.lower()
    guess = checker.correction(lowered)
    if not guess or guess == lowered:
        return None  # could not fix
    if bare[0].isupper():
        # Carry the capitalisation of the original first letter over.
        guess = guess[0].upper() + guess[1:]
    return guess
|
||||
|
||||
|
||||
def autocorrect_pipe_artifacts(
    zones_data: List[Dict], session_id: str,
) -> int:
    """Strip OCR pipe artifacts and correct garbled words in-place.

    Printed syllable divider lines on dictionary scans are read by OCR
    as ``|`` characters embedded in words (e.g. ``Zel|le``, ``Ze|plpe|lin``).
    For every cell whose ``col_type`` starts with ``column_`` this function:

    1. Splits each word box containing ``|`` into leading non-letters, a
       core and trailing non-letters, and passes the core to
       ``_autocorrect_piped_word``.  If that returns a different word, the
       word-box text is replaced (lead and trail are kept).
    2. Rebuilds the cell text from its word boxes when at least one word
       box was changed.
    3. Removes any ``|`` still left in the cell text.  Word boxes that could
       not be corrected keep their original text.

    A ``None`` value for ``cells``, ``col_type``, ``word_boxes`` or a
    ``text`` field is treated like a missing key.

    Args:
        zones_data: Zone dicts; cells are read from each zone's ``cells``
            list and modified in place.
        session_id: Only used in the summary log message.

    Returns:
        The number of cells modified.
    """
    if _get_spellchecker() is None:
        # Without the spellchecker no word can be validated or corrected,
        # but the cell-text fallback below still strips the pipes.
        logger.warning("spellchecker not available -- pipe autocorrect limited")

    # lead (non-letters) / core / trail (non-letters)
    word_re = re.compile(
        r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)'
        r'(.*?)'
        r'([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$'
    )

    modified = 0
    for z in zones_data:
        for cell in z.get("cells") or []:
            ct = cell.get("col_type") or ""
            if not ct.startswith("column_"):
                continue

            cell_changed = False
            word_boxes = cell.get("word_boxes") or []

            # --- Fix word boxes ---
            for wb in word_boxes:
                # "or" rather than a .get() default: the key may be present
                # with an explicit None value.
                wb_text = wb.get("text") or ""
                if "|" not in wb_text:
                    continue

                # Separate leading/trailing punctuation from the word core.
                m = word_re.match(wb_text)
                if not m:
                    continue
                lead, core, trail = m.group(1), m.group(2), m.group(3)
                if "|" not in core:
                    continue

                corrected = _autocorrect_piped_word(core)
                if corrected is not None and corrected != core:
                    wb["text"] = lead + corrected + trail
                    cell_changed = True

            # --- Rebuild cell text from word boxes ---
            if cell_changed:
                cell["text"] = " ".join(
                    (wb.get("text") or "") for wb in word_boxes
                )
                modified += 1

            # --- Fallback: strip residual | from cell text ---
            text = cell.get("text") or ""
            if "|" in text:
                cell["text"] = text.replace("|", "")
                if not cell_changed:
                    # Count the cell only once.
                    modified += 1

    if modified:
        logger.info(
            "build-grid session %s: autocorrected pipe artifacts in %d cells",
            session_id, modified,
        )
    return modified
|
||||
@@ -0,0 +1,32 @@
|
||||
"""
|
||||
Syllable divider insertion for dictionary pages — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
cv_syllable_core — hyphenator init, word validation, pipe autocorrect
|
||||
cv_syllable_merge — word gap merging, syllabification, divider insertion
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
# Core: init, validation, autocorrect
|
||||
from cv_syllable_core import ( # noqa: F401
|
||||
_IPA_RE,
|
||||
_STOP_WORDS,
|
||||
_get_hyphenators,
|
||||
_get_spellchecker,
|
||||
_is_known_word,
|
||||
_is_real_word,
|
||||
_hyphenate_word,
|
||||
_autocorrect_piped_word,
|
||||
autocorrect_pipe_artifacts,
|
||||
)
|
||||
|
||||
# Merge: gap merging, syllabify, insert
|
||||
from cv_syllable_merge import ( # noqa: F401
|
||||
_try_merge_pipe_gaps,
|
||||
merge_word_gaps_in_zones,
|
||||
_try_merge_word_gaps,
|
||||
_syllabify_text,
|
||||
insert_syllable_dividers,
|
||||
)
|
||||
@@ -0,0 +1,300 @@
|
||||
"""
|
||||
Syllable Merge — word gap merging, syllabification, divider insertion.
|
||||
|
||||
Extracted from cv_syllable_detect.py for modularity.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_syllable_core import (
|
||||
_get_hyphenators,
|
||||
_hyphenate_word,
|
||||
_IPA_RE,
|
||||
_STOP_WORDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _try_merge_pipe_gaps(text: str, hyph_de) -> str:
    """Rejoin word fragments that OCR separated with a single space.

    Example: "Kaf fee" -> "Kaffee".  Merging is cumulative, so
    "Ka bel jau" can become "Kabel jau" and then "Kabeljau".

    A fragment is glued onto the token before it only when all of the
    following hold:

    - the preceding token consists of letters only (no punctuation);
    - both tokens contain at least one letter (the second one may carry
      punctuation, which stays attached to the merged token);
    - neither letter core is listed in ``_STOP_WORDS``;
    - the shorter letter core has at most 3 characters;
    - both cores together have at least 4 characters;
    - ``hyph_de.inserted`` places at least one hyphenation point in the
      joined letters.

    Args:
        text: Cell text; it is split on single spaces.
        hyph_de: German hyphenator used to validate a merge.

    Returns:
        The text with accepted merges applied; all other spaces are kept.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    non_letters = r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]'
    merged = [parts[0]]
    for fragment in parts[1:]:
        head = merged[-1]

        # Letter-only cores used for all checks and for the lookup.
        head_letters = re.sub(non_letters, '', head)
        frag_letters = re.sub(non_letters, '', fragment)

        eligible = (
            head == head_letters  # word-start fragment without punctuation
            and head_letters and frag_letters
            and head_letters.lower() not in _STOP_WORDS
            and frag_letters.lower() not in _STOP_WORDS
            and min(len(head_letters), len(frag_letters)) <= 3
            and len(head_letters) + len(frag_letters) >= 4
        )
        if eligible and '-' in hyph_de.inserted(head_letters + frag_letters, hyphen='-'):
            # Hyphenator found a break point in the joined word: drop the space.
            merged[-1] = head + fragment
        else:
            merged.append(fragment)

    return ' '.join(merged)
|
||||
|
||||
|
||||
def merge_word_gaps_in_zones(zones_data: List[Dict], session_id: str) -> int:
    """Rejoin OCR-split word fragments in the text of content cells.

    OCR often cuts a word at a syllable boundary ("zerknit tert" instead
    of "zerknittert").  Every cell whose ``col_type`` starts with
    ``column_`` and whose text contains a space is passed through
    ``_try_merge_word_gaps``; the cell text is replaced when that changes
    it.  Cells that still match ``_IPA_RE`` after complete ``[...]``
    groups are removed are left untouched.

    Args:
        zones_data: Zone dicts whose ``cells`` are updated in place.
        session_id: Only used in the summary log message.

    Returns:
        The number of cells modified; 0 when no hyphenator is available.
    """
    hyph_de = _get_hyphenators()[0]
    if hyph_de is None:
        return 0

    changed = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            if not cell.get("col_type", "").startswith("column_"):
                continue
            original = cell.get("text", "")
            if not original or " " not in original:
                continue

            # Skip IPA cells
            if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', original)):
                continue

            merged = _try_merge_word_gaps(original, hyph_de)
            if merged != original:
                cell["text"] = merged
                changed += 1

    if changed:
        logger.info(
            "build-grid session %s: merged word gaps in %d cells",
            session_id, changed,
        )
    return changed
|
||||
|
||||
|
||||
def _try_merge_word_gaps(text: str, hyph_de) -> str:
    """Rejoin space-separated word fragments, allowing fragments up to 5 letters.

    Works like ``_try_merge_pipe_gaps`` but the shorter fragment may have
    up to 5 letters instead of 3.  A merge is still only accepted when the
    preceding token is letters-only, neither letter core is in
    ``_STOP_WORDS``, both cores together have at least 4 letters, and
    ``hyph_de.inserted`` places a hyphenation point in the joined letters.

    Args:
        text: Cell text; it is split on single spaces.
        hyph_de: German hyphenator used to validate a merge.

    Returns:
        The text with accepted merges applied.
    """
    parts = text.split(' ')
    if len(parts) < 2:
        return text

    def letters_of(token):
        # Keep only ASCII letters, German umlauts and sharp s.
        return re.sub(r'[^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]', '', token)

    out = [parts[0]]
    for fragment in parts[1:]:
        left = out[-1]
        left_letters = letters_of(left)
        right_letters = letters_of(fragment)

        candidate = (
            left == left_letters
            and left_letters and right_letters
            and left_letters.lower() not in _STOP_WORDS
            and right_letters.lower() not in _STOP_WORDS
            and min(len(left_letters), len(right_letters)) <= 5
            and len(left_letters) + len(right_letters) >= 4
        )
        if candidate and '-' in hyph_de.inserted(left_letters + right_letters, hyphen='-'):
            out[-1] = left + fragment
            continue
        out.append(fragment)

    return ' '.join(out)
|
||||
|
||||
|
||||
def _syllabify_text(text: str, hyph_de, hyph_en) -> str:
    """Insert ``|`` syllable dividers into the words of *text*.

    Steps:

    1. Empty text is returned as is.  Text that still matches ``_IPA_RE``
       after complete ``[...]`` groups are removed is returned unchanged.
    2. Existing ``|`` dividers are removed and pipe-gap spaces are merged
       via ``_try_merge_pipe_gaps``.
    3. The text is split on whitespace and on runs of ``,`` ``;`` ``:``
       (separators are kept).  Every other token is divided into leading
       non-letters, a word and trailing non-letters; words of at least 3
       characters are passed to ``_hyphenate_word``.
    4. Tokens that are not hyphenated are emitted unchanged.

    Args:
        text: Cell text.
        hyph_de: German hyphenator.
        hyph_en: English hyphenator (fallback inside ``_hyphenate_word``).

    Returns:
        The re-assembled text.
    """
    if not text:
        return text

    # Skip cells that contain IPA transcription characters outside brackets.
    if _IPA_RE.search(re.sub(r'\[[^\]]*\]', '', text)):
        return text

    # Normalise: drop old dividers, then close gaps left by split pipes.
    normalized = _try_merge_pipe_gaps(text.replace('|', ''), hyph_de)

    pieces = []
    # The capturing group keeps the separators in the token list.
    for token in re.split(r'(\s+|[,;:]+\s*)', normalized):
        rebuilt = token
        if token and not re.match(r'^[\s,;:]+$', token):
            # Peel punctuation off both ends before the lookup.
            parsed = re.match(r'^([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)(.*?)([^a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u1e9e]*)$', token)
            if parsed:
                lead, word, trail = parsed.group(1), parsed.group(2), parsed.group(3)
                if len(word) >= 3 and re.search(r'[a-zA-Z\u00e4\u00f6\u00fc\u00c4\u00d6\u00dc\u00df]', word):
                    hyphenated = _hyphenate_word(word, hyph_de, hyph_en)
                    if hyphenated:
                        rebuilt = lead + hyphenated + trail
        pieces.append(rebuilt)

    return ''.join(pieces)
|
||||
|
||||
|
||||
def insert_syllable_dividers(
    zones_data: List[Dict],
    img_bgr: np.ndarray,
    session_id: str,
    *,
    force: bool = False,
    col_filter: Optional[set] = None,
) -> int:
    """Insert or normalise ``|`` syllable dividers in content cells.

    Each processed cell text is run through ``_syllabify_text`` (strip
    existing pipes, merge pipe-gap spaces, re-hyphenate) and replaced when
    the result differs.

    Unless *force* is set, a pre-check counts the cells whose ``col_type``
    starts with ``column_`` and how many of them already contain ``|``;
    if fewer than 1% do, nothing is changed.  In that mode only cells that
    already contain ``|`` are processed.

    Args:
        zones_data: Zone dicts whose ``cells`` are updated in place.
        img_bgr: Not used by this function.
        session_id: Only used in log messages.
        force: If True, skip the pipe-ratio pre-check and syllabify every
            non-empty content cell.
        col_filter: If set, only cells whose ``col_type`` is in this set
            are processed.  ``None`` means all content columns.

    Returns:
        The number of cells modified; 0 when pyphen is unavailable or the
        pre-check fails.
    """
    hyph_de, hyph_en = _get_hyphenators()
    if hyph_de is None:
        logger.warning("pyphen not installed -- skipping syllable insertion")
        return 0

    # Pre-check: how many content cells already carry | from OCR?
    if not force:
        content_texts = [
            cell.get("text", "")
            for zone in zones_data
            for cell in zone.get("cells", [])
            if cell.get("col_type", "").startswith("column_")
        ]
        if content_texts:
            piped = sum(1 for cell_text in content_texts if "|" in cell_text)
            pipe_ratio = piped / len(content_texts)
            if pipe_ratio < 0.01:
                logger.info(
                    "build-grid session %s: skipping syllable insertion -- "
                    "only %.1f%% of cells have existing pipes (need >=1%%)",
                    session_id, pipe_ratio * 100,
                )
                return 0

    changed = 0
    for zone in zones_data:
        for cell in zone.get("cells", []):
            col_type = cell.get("col_type", "")
            if not col_type.startswith("column_"):
                continue
            if col_filter is not None and col_type not in col_filter:
                continue
            original = cell.get("text", "")
            if not original:
                continue
            # Auto mode only normalises cells that already have printed
            # dividers; it never adds marks to other words.
            if not force and "|" not in original:
                continue

            updated = _syllabify_text(original, hyph_de, hyph_en)
            if updated != original:
                cell["text"] = updated
                changed += 1

    if changed:
        logger.info(
            "build-grid session %s: syllable dividers inserted/normalized "
            "in %d cells (pyphen)",
            session_id, changed,
        )
    return changed
|
||||
@@ -0,0 +1,493 @@
|
||||
"""
|
||||
Cell text filtering, column/row word assignment, and bold detection.
|
||||
|
||||
This module contains:
|
||||
- _assign_row_words_to_columns(): spatial assignment of OCR words to grid columns
|
||||
- Cell text noise filtering (_clean_cell_text, _clean_cell_text_lite, etc.)
|
||||
- Bold detection via stroke-width analysis (_measure_stroke_width, _classify_bold_cells)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion, RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Column / Row word assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word of a row to exactly one column.

    Column positions are shifted by ``row.x`` so they can be compared with
    the words' ``left`` / ``width`` values.  For every word the column is
    chosen in three steps:

    1. Overlap: the column whose horizontal extent overlaps the word the
       most (the first such column on ties).
    2. Midpoint ranges: if the word overlaps no column, the column whose
       assignment range contains the word centre.  Ranges are bounded by
       the midpoints of the gaps between neighbouring columns; the first
       range starts at 0 and the last ends at ``row.width + 100``.
    3. Nearest centre: if no range contains the centre, the column whose
       centre is closest to the word centre.

    Args:
        row: Row providing ``words``, ``x`` and ``width``.
        columns: Columns in left-to-right order, providing ``x`` and
            ``width``.

    Returns:
        Dict mapping column index -> list of words assigned to that column
        (every column index is present, possibly with an empty list).
    """
    assignment: Dict[int, List[Dict]] = {idx: [] for idx in range(len(columns))}
    if not row.words or not columns:
        return assignment

    origin_x = row.x
    count = len(columns)

    # (left, right) of every column relative to the row origin.
    extents = [(c.x - origin_x, c.x - origin_x + c.width) for c in columns]

    # Non-overlapping fallback ranges split at the midpoint of each gap.
    ranges = []
    for idx, (left, right) in enumerate(extents):
        if idx == 0:
            lower = 0
        else:
            lower = (extents[idx - 1][1] + left) / 2
        if idx == count - 1:
            upper = row.width + 100  # generous for last column
        else:
            upper = (right + extents[idx + 1][0]) / 2
        ranges.append((lower, upper))

    for word in row.words:
        start = word['left']
        end = start + word['width']
        center = start + word['width'] / 2

        # Step 1: largest horizontal overlap wins.
        winner, winner_overlap = -1, 0
        for idx, (left, right) in enumerate(extents):
            shared = max(0, min(end, right) - max(start, left))
            if shared > winner_overlap:
                winner, winner_overlap = idx, shared
        if winner >= 0 and winner_overlap > 0:
            assignment[winner].append(word)
            continue

        # Step 2: range that contains the word centre.
        winner = next(
            (idx for idx, (lower, upper) in enumerate(ranges) if lower <= center < upper),
            None,
        )
        if winner is None:
            # Step 3: nearest column centre (first column on ties).
            winner = min(
                range(count),
                key=lambda idx: abs(center - (extents[idx][0] + columns[idx].width / 2)),
            )
        assignment[winner].append(word)

    return assignment
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cell text noise filtering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
# Regex: one single letter from the same alphabet; used with findall() to
# collect the letters of a token.
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
|
||||
|
||||
# Common short EN/DE words (1-3 chars), all lower-case.  Tokens at the end
# of a cell that do NOT appear here are treated as trailing OCR noise.
# Some words are listed in both the EN and the DE section; duplicates are
# harmless in a set literal.
_COMMON_SHORT_WORDS: set = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}
|
||||
|
||||
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored lower-case and WITHOUT trailing period (the noise filter reduces a
# token to its letters and lower-cases it before the lookup).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
    # EN dictionary meta-words
    'sth', 'sb', 'smth', 'smb', 'sbd',
    # EN general
    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
    # EN references / textbook
    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
    'ans', 'wb', 'tb', 'vocab',
    # EN parts of speech / grammar
    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
    'syn', 'ant', 'opp', 'var', 'orig',
    # EN titles
    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
    # EN pronunciation
    'br', 'am', 'brit', 'amer',
    # EN units
    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
    # DE general
    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
    'bes', 'insb', 'insbes', 'bspw', 'ca',
    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
    'inkl', 'exkl', 'zzgl', 'abzgl',
    # DE references
    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
    's', 'sp', 'zit', 'zs', 'vlg',
    # DE grammar
    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
    # DE regional
    'nordd', 'österr', 'schweiz',
    # Linguistic
    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
|
||||
|
||||
|
||||
def _is_noise_tail_token(token: str) -> bool:
    """Return True when *token*, taken from the END of a cell, is OCR noise.

    Trailing fragments picked up from image edges, borders and neighbouring
    cells are frequent, so this check is deliberately stricter than a general
    word filter: a short token survives only if it is explicitly recognised.

    Kept (returns False):
      * ellipsis ("...", "…")
      * anything starting with "[" or ending with "]" (phonetic brackets,
        including half-open ones such as ``serva]``)
      * a fixed set of textbook punctuation markers ("=", "(=", ";", ...)
      * known abbreviations (letters only, lower-cased)
      * tokens with >= 4 letters, or listed in ``_COMMON_SHORT_WORDS``,
        provided no non-letter residue remains after dictionary punctuation
        (parentheses, hyphen, slash, ``.,;:!?``) has been removed

    Dropped (returns True): empty tokens, tokens without any letter, and
    everything not covered above (e.g. "Es)", "3", "ee", "B").
    """
    tok = token.strip()
    if not tok:
        return True

    # Ellipsis is legitimate content.
    if tok in ('...', '…'):
        return False

    # Phonetic brackets, possibly with a lost opener or closer.
    # (A '["' or "['" prefix is already covered by the '[' prefix.)
    if tok.startswith('[') or tok.endswith(']'):
        return False

    # Definition markers and separators used in textbooks.
    if tok in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
        return False

    # No letters at all ("3", ")", "|") -> noise.
    alpha_chars = _RE_ALPHA.findall(tok)
    if not alpha_chars:
        return True

    letters = ''.join(alpha_chars)
    word = letters.lower()

    # "sth.", "usw.", "adj." ... are always kept.
    if word in _KNOWN_ABBREVIATIONS:
        return False

    # Drop ordinary sentence punctuation at the end ("cupcakes." -> "cupcakes");
    # fall back to the raw token if that leaves nothing.
    without_tail = re.sub(r'[.,;:!?]+$', '', tok)
    candidate = without_tail or tok

    # Remove punctuation that is normal inside dictionary entries, e.g.
    # "(Salat-)Gurke", "Tanz(veranstaltung)", "wir/uns", "e.g.".  Whatever
    # non-letter characters remain afterwards ("3d", "B|", "x7") are noise.
    residue = re.sub(r'[()\-/.,;:!?]', '', candidate)
    residue_letters = ''.join(_RE_ALPHA.findall(residue))
    has_internal_noise = bool(residue) and len(residue) > len(residue_letters)
    if has_internal_noise:
        return True

    # Clean token: keep it if it is long enough or a known short word.
    return not (len(letters) >= 4 or word in _COMMON_SHORT_WORDS)
|
||||
|
||||
|
||||
def _is_garbage_text(text: str) -> bool:
    """Return True when the whole cell text looks like OCR garbage.

    The text counts as garbage when it contains no recognisable word, which
    catches image-area artefacts such as "(ci]oeu" or "uanoaain.".

    Decision rules:
      * No ``_RE_REAL_WORD`` match at all: garbage unless the letters of the
        text, concatenated and lower-cased, form a known abbreviation
        (so "e.g." survives).
      * Otherwise the text is NOT garbage as soon as one matched word is a
        known short word / abbreviation, or has at least 4 characters with a
        vowel share between 15% and 65% (inclusive).
    """
    candidates = _RE_REAL_WORD.findall(text)
    if not candidates:
        letters = ''.join(_RE_ALPHA.findall(text)).lower()
        return letters not in _KNOWN_ABBREVIATIONS

    for candidate in candidates:
        lowered = candidate.lower()

        # Dictionary hit -> real content.
        if lowered in _COMMON_SHORT_WORDS or lowered in _KNOWN_ABBREVIATIONS:
            return False

        # Short unknown words carry no evidence either way.
        if len(lowered) < 4:
            continue

        # Plausibility heuristic for longer words: strings such as
        # "uanoaain" have too many vowels, consonant runs have too few.
        vowel_count = sum(1 for ch in lowered if ch in 'aeiouäöü')
        if 0.15 <= vowel_count / len(lowered) <= 0.65:
            return False

    return True
|
||||
|
||||
|
||||
def _clean_cell_text(text: str) -> str:
    """Remove OCR noise from cell text and return the cleaned string.

    Three generic filters are applied in order:

    1. No real alphabetic word (>= 2 letters) and not a known abbreviation
       -> the cell is cleared.
    2. The entire text is garbage (no dictionary word) -> the cell is cleared.
    3. Noise tokens are stripped from the END of the text.

    Filters 1 and 2 are exactly what ``_clean_cell_text_lite`` implements, so
    they are delegated to it instead of being duplicated here.

    Args:
        text: Raw OCR text of one cell.

    Returns:
        The whitespace-normalised text (tokens joined by single spaces) with
        trailing noise removed, or ``''`` if nothing meaningful remains.
    """
    # Cheap guard so blank input never reaches the regex-based filters.
    if not text.strip():
        return ''

    # Filters 1 + 2: returns the stripped text, or '' when it was rejected.
    kept = _clean_cell_text_lite(text)
    if not kept:
        return ''

    # Filter 3: peel noise tokens off the end, one at a time.
    tokens = kept.split()
    while tokens and _is_noise_tail_token(tokens[-1]):
        tokens.pop()

    # An empty token list joins to '', which is the "cleared" result.
    return ' '.join(tokens)
|
||||
|
||||
|
||||
def _clean_cell_text_lite(text: str) -> str:
    """Reduced noise filter for cell-first OCR (cells OCR'd as isolated crops).

    An isolated crop cannot contain content from neighbouring cells, so the
    trailing-noise stripping of ``_clean_cell_text`` is skipped.  Two filters
    remain:

    1. No real alphabetic word (>= 2 letters) and not a known abbreviation
       -> empty string.
    2. Entire text is garbage (no dictionary word) -> empty string.

    Returns the stripped input when both filters pass.
    """
    stripped = text.strip()
    if not stripped:
        return ''

    # Filter 1: without any real word, only dotted abbreviations such as
    # "e.g." / "z.B." (letters concatenated and lower-cased) are allowed.
    if _RE_REAL_WORD.search(stripped) is None:
        letters = ''.join(_RE_ALPHA.findall(stripped)).lower()
        if letters not in _KNOWN_ABBREVIATIONS:
            return ''

    # Filter 2: reject text that contains no plausible word at all.
    return '' if _is_garbage_text(stripped) else stripped
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bold detection via stroke-width analysis (relative / page-level)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
    """Estimate the stroke thickness of the text in a cell crop.

    The crop is binarised with an inverted Otsu threshold, a distance
    transform gives every foreground pixel its distance to the background,
    and the foreground is then eroded repeatedly to approximate the stroke
    centre lines.  The mean distance value on those remaining pixels is the
    stroke metric.

    Args:
        gray_crop: Cell image.  Presumably 8-bit single-channel, as required
            by Otsu thresholding -- TODO confirm against callers.

    Returns:
        The metric expressed as a percentage of the crop height (making it
        independent of scan resolution), or 0.0 when the crop is missing,
        empty, smaller than 10 px in either dimension, or contains too
        little foreground to measure.
    """
    if gray_crop is None or gray_crop.size == 0:
        return 0.0

    height, width = gray_crop.shape[:2]
    if min(height, width) < 10:
        return 0.0

    # Inverted binarisation: pixels on the dark side of the Otsu threshold
    # become 255 (foreground), everything else 0.
    _, ink = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(ink) < 20:
        return 0.0

    # For every foreground pixel: Euclidean distance to the nearest
    # background pixel.
    dist_to_bg = cv2.distanceTransform(ink, cv2.DIST_L2, 3)

    # Approximate the skeleton by eroding until (almost) nothing is left.
    # The last non-degenerate erosion result is kept.
    cross = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    core = ink.copy()
    max_rounds = max(1, min(height, width) // 6)
    for _ in range(max_rounds):
        shrunk = cv2.erode(core, cross)
        if cv2.countNonZero(shrunk) < 5:
            break
        core = shrunk

    core_mask = core > 0
    if not np.any(core_mask):
        return 0.0

    mean_stroke = float(np.mean(dist_to_bg[core_mask]))
    # Normalise by crop height so different resolutions are comparable.
    return mean_stroke / max(height, 1) * 100
|
||||
|
||||
|
||||
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int) -> None:
    """Two-pass bold detection relative to the page's typical stroke width.

    Pass 1 measures a stroke metric for every cell that has text; pass 2
    marks a cell as bold when its metric exceeds 1.4x the median of all
    measured cells.  Comparing against the page median adapts the decision
    to font, resolution and scan quality.

    Args:
        cells: Cell dicts, modified in place.  Cells with text must provide
            ``bbox_px`` with ``x``, ``y``, ``w``, ``h`` in pixels.
        ocr_img: Image the cell crops are taken from; ``None`` disables
            detection.
        img_w: Image width, used to clip cell boxes.
        img_h: Image height, used to clip cell boxes.

    Returns:
        None.  ``cell['is_bold']`` is written for every cell only when at
        least three cells could be measured and the median is positive;
        otherwise the cells are left untouched (no ``is_bold`` key is added).
    """
    if ocr_img is None:
        return

    # Pass 1: one stroke value per cell (0.0 = not measurable / no text).
    cell_strokes: List[float] = []
    for cell in cells:
        sw = 0.0
        # 'text' may be present but None, so normalise before stripping.
        if (cell.get('text') or '').strip():
            bp = cell['bbox_px']
            # Clip the box to the image so the slice below is always valid.
            y1 = max(0, bp['y'])
            y2 = min(img_h, bp['y'] + bp['h'])
            x1 = max(0, bp['x'])
            x2 = min(img_w, bp['x'] + bp['w'])
            if y2 > y1 and x2 > x1:
                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
        cell_strokes.append(sw)

    metrics = [sw for sw in cell_strokes if sw > 0]
    if len(metrics) < 3:
        # Too few measurements for a meaningful median -- leave cells as-is.
        return

    median_sw = float(np.median(metrics))
    if median_sw <= 0:
        return

    # Pass 2: clearly thicker than the page median -> bold.
    for cell, sw in zip(cells, cell_strokes):
        cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
|
||||
@@ -0,0 +1,189 @@
|
||||
"""Cell-level IPA phonetic fixes for overlay mode.
|
||||
|
||||
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_insert_missing_ipa,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
)
|
||||
from cv_ocr_ipa_repair import (
|
||||
_has_non_dict_trailing,
|
||||
_insert_headword_ipa,
|
||||
_strip_post_bracket_garbled,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fix_cell_phonetics(
    cells: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Apply IPA phonetic fixes to ``cell['text']`` for overlay mode.

    The regular pipeline repairs phonetics on vocab entries
    (``entry['english']``), but the overlay renders ``cell['text']``
    directly, so the same repairs have to be applied at cell level.

    Only cells whose ``col_type`` is ``column_en`` or ``column_text`` and
    whose text is non-blank are processed:

    - ``column_en``: garbled IPA is replaced with orphan brackets stripped.
      If that changed nothing and the text shows signs of garbled IPA (or
      trailing non-dictionary words), missing IPA is inserted.  Finally, if
      the text contains ``]``, garbled fragments after the bracket are
      removed (e.g. "sea [sˈiː] si:" -> "sea [sˈiː]").
    - ``column_text``: garbled IPA is replaced WITHOUT orphan stripping
      (brackets may be German content such as "(probieren)").  Headword IPA
      is inserted only when the replacement changed nothing AND the cell's
      ``word_boxes`` show a gap where an IPA bracket was missed; in that
      case ``word_boxes`` is extended to match the new token.

    Args:
        cells: Cell dicts; modified in place.
        pronunciation: Passed through to the IPA helpers.

    Returns:
        The same ``cells`` list.  When IPA support is unavailable the list
        is returned untouched.
    """
    if not IPA_AVAILABLE:
        return cells

    handled_col_types = {'column_en', 'column_text'}
    replaced = 0

    for cell in cells:
        col_type = cell.get('col_type', '')
        if col_type not in handled_col_types:
            continue

        text = cell.get('text', '') or ''
        if not text.strip():
            continue

        if col_type == 'column_en':
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
            # Nothing replaced: insert IPA if the text hints at phonetics
            # that OCR destroyed.
            if new_text == text and (
                _text_has_garbled_ipa(text)
                or _has_non_dict_trailing(text, pronunciation)
            ):
                new_text = _insert_missing_ipa(text, pronunciation)
            # Remove leftovers that follow a proper [IPA] bracket.
            if ']' in new_text:
                new_text = _strip_post_bracket_garbled(new_text, pronunciation)
        else:
            new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
            # Without gap evidence the printed page had no IPA here, so
            # nothing is inserted.
            if new_text == text and _has_ipa_gap(text, cell.get('word_boxes', [])):
                inserted = _insert_headword_ipa(text, pronunciation)
                if inserted != text:
                    new_text = inserted
                    # Keep word_boxes aligned with the extra token.
                    _sync_word_boxes_after_ipa_insert(cell, text, new_text)

        if new_text == text:
            continue

        logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
        cell['text'] = new_text
        replaced += 1

    if replaced:
        logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells")
    return cells
|
||||
|
||||
|
||||
def _has_ipa_gap(text: str, word_boxes: List[Dict], min_gap_px: int = 80) -> bool:
    """Check whether ``word_boxes`` show a gap where an IPA bracket should be.

    A typical vocab line is laid out as::

        headword [ipa] German translation

    If OCR missed the IPA bracket, the horizontal distance between the
    headword box and the following box is unusually large, because the IPA
    occupied physical space on the page.  When the page had no IPA (e.g.
    "be good at sth."), the words sit close together.

    Args:
        text: Cell text; a blank text never has a gap.
        word_boxes: Word dicts with ``text``, ``left`` and ``width``.
        min_gap_px: The gap must be strictly larger than this many pixels.
            Defaults to 80, the previously hard-coded threshold.

    Returns:
        True if the gap after the headword exceeds ``min_gap_px``.
        False if there are fewer than two boxes, the text is blank, or the
        headword is the last box.
    """
    if not word_boxes or len(word_boxes) < 2:
        return False
    if not text.split():
        return False

    # The headword is the first box with at least two letters; this skips
    # numeric prefixes such as "».55" or "0.56".  If no box qualifies, the
    # first box is used.
    hw_box_idx = next(
        (
            idx
            for idx, wb in enumerate(word_boxes)
            if len(re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wb.get('text') or '')) >= 2
        ),
        0,
    )
    if hw_box_idx >= len(word_boxes) - 1:
        return False

    hw = word_boxes[hw_box_idx]
    next_wb = word_boxes[hw_box_idx + 1]
    # Distance from the headword's right edge to the next box's left edge.
    gap = next_wb['left'] - (hw['left'] + hw['width'])
    return gap > min_gap_px
|
||||
|
||||
|
||||
def _sync_word_boxes_after_ipa_insert(
    cell: Dict[str, Any],
    old_text: str,
    new_text: str,
) -> None:
    """Add a synthetic word box for the single token an IPA insertion added.

    Example: "challenge ..." -> "challenge [tʃælɪndʒ] ...".  A new box is
    placed directly after the box of the preceding token so that the
    frontend overlay's 1:1 token-to-box mapping stays intact.

    Nothing happens when the cell has no ``word_boxes``, when ``new_text``
    does not contain exactly one token more than ``old_text``, or when the
    inserted token has no preceding box (insertion at position 0, or fewer
    boxes than tokens).

    The synthetic box copies ``top``, ``width`` and ``height`` from the
    reference box, starts 2 px right of it, and inherits its ``conf``
    (default 90).  ``cell['word_boxes']`` is modified in place.
    """
    boxes = cell.get('word_boxes')
    if not boxes:
        return

    before = old_text.split()
    after = new_text.split()
    if len(after) != len(before) + 1:
        return  # not a single-token insertion

    # Advance while both token lists agree; the first disagreement (or the
    # end of the old list) is where the IPA token was inserted.
    insert_idx = 0
    while insert_idx < len(before) and after[insert_idx] == before[insert_idx]:
        insert_idx += 1

    # The token before the insertion point provides the reference geometry.
    ref_idx = insert_idx - 1
    if ref_idx < 0 or ref_idx >= len(boxes):
        return

    ref_box = boxes[ref_idx]
    boxes.insert(insert_idx, {
        'text': after[insert_idx],
        'left': ref_box['left'] + ref_box['width'] + 2,
        'top': ref_box['top'],
        'width': ref_box['width'],
        'height': ref_box['height'],
        'conf': ref_box.get('conf', 90),
    })
|
||||
@@ -0,0 +1,381 @@
|
||||
"""
|
||||
OCR engines (RapidOCR, TrOCR, LightOn) and re-exports.
|
||||
|
||||
This module contains the OCR engine wrappers and re-exports all functions
|
||||
from the split sub-modules for backward compatibility.
|
||||
|
||||
Sub-modules:
|
||||
- cv_ocr_word_assembly: Word grouping and text assembly
|
||||
- cv_ocr_vocab_postprocess: Vocabulary postprocessing (char confusion, comma split)
|
||||
- cv_ocr_ipa_lookup: Core IPA lookup and bracket handling
|
||||
- cv_ocr_ipa_repair: Advanced IPA repair (continuation cells, post-bracket cleanup)
|
||||
- cv_ocr_cell_phonetics: Cell-level phonetics for overlay
|
||||
- cv_ocr_cell_filter: Cell text filtering, column assignment, bold detection
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
PageRegion,
|
||||
RowGeometry,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# ── Re-exports from sub-modules (backward compatibility) ──────────────────
|
||||
|
||||
from cv_ocr_word_assembly import ( # noqa: F401
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_lines,
|
||||
_rejoin_hyphenated,
|
||||
_words_to_reading_order_text,
|
||||
_words_to_spaced_text,
|
||||
)
|
||||
|
||||
from cv_ocr_vocab_postprocess import ( # noqa: F401
|
||||
_CHAR_CONFUSION_RULES,
|
||||
_DE_INDICATORS_FOR_EN_I,
|
||||
_fix_character_confusion,
|
||||
_is_singular_plural_pair,
|
||||
_split_comma_entries,
|
||||
_split_by_comma,
|
||||
_find_best_vocab_match,
|
||||
_attach_example_sentences,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_lookup import ( # noqa: F401
|
||||
_PHONETIC_BRACKET_RE,
|
||||
_IPA_CHARS,
|
||||
_MIN_WORD_CONF,
|
||||
_GRAMMAR_BRACKET_WORDS,
|
||||
_lookup_ipa,
|
||||
_fix_phonetic_brackets,
|
||||
_is_grammar_bracket_content,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
_decompose_compound,
|
||||
_insert_missing_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_ipa_repair import ( # noqa: F401
|
||||
_has_non_dict_trailing,
|
||||
_strip_post_bracket_garbled,
|
||||
fix_ipa_continuation_cell,
|
||||
_insert_headword_ipa,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_phonetics import ( # noqa: F401
|
||||
fix_cell_phonetics,
|
||||
_has_ipa_gap,
|
||||
_sync_word_boxes_after_ipa_insert,
|
||||
)
|
||||
|
||||
from cv_ocr_cell_filter import ( # noqa: F401
|
||||
_RE_REAL_WORD,
|
||||
_RE_ALPHA,
|
||||
_COMMON_SHORT_WORDS,
|
||||
_KNOWN_ABBREVIATIONS,
|
||||
_assign_row_words_to_columns,
|
||||
_is_noise_tail_token,
|
||||
_is_garbage_text,
|
||||
_clean_cell_text,
|
||||
_clean_cell_text_lite,
|
||||
_measure_stroke_width,
|
||||
_classify_bold_cells,
|
||||
)
|
||||
|
||||
|
||||
# ── OCR Engine Wrappers ───────────────────────────────────────────────────
|
||||
|
||||
# Lazily created RapidOCR engine instance, shared by all callers.
_rapid_engine = None

# Probe for the optional RapidOCR dependency once at import time.
try:
    from rapidocr import RapidOCR as _RapidOCRClass
    from rapidocr import LangRec as _LangRec, OCRVersion as _OCRVersion, ModelType as _ModelType
except ImportError:
    RAPIDOCR_AVAILABLE = False
    logger.info("RapidOCR not installed — using Tesseract only")
else:
    RAPIDOCR_AVAILABLE = True
    logger.info("RapidOCR available — can be used as alternative to Tesseract")
|
||||
|
||||
|
||||
def _get_rapid_engine():
    """Return the shared RapidOCR engine, creating it on first use.

    The engine is configured with the PP-OCRv5 Latin recognition model
    (server variant) for German support, and is cached in the module-level
    ``_rapid_engine`` so later calls reuse it.

    NOTE(review): the RapidOCR classes used here are only bound when the
    ``rapidocr`` import at module level succeeded -- callers are presumably
    expected to check ``RAPIDOCR_AVAILABLE`` or handle the resulting error.
    """
    global _rapid_engine

    if _rapid_engine is not None:
        return _rapid_engine

    engine_params = {
        "Rec.lang_type": _LangRec.LATIN,
        "Rec.model_type": _ModelType.SERVER,
        "Rec.ocr_version": _OCRVersion.PPOCRV5,
        "Det.unclip_ratio": 1.3,
        "Det.box_thresh": 0.4,
        "Global.log_level": "critical",
    }
    _rapid_engine = _RapidOCRClass(params=engine_params)
    logger.info("RapidOCR engine initialized (PP-OCRv5 Latin, unclip_ratio=1.3)")
    return _rapid_engine
|
||||
|
||||
|
||||
def ocr_region_rapid(
    img_bgr: np.ndarray,
    region: PageRegion,
) -> List[Dict[str, Any]]:
    """Run RapidOCR on one region and return Tesseract-style word dicts.

    Args:
        img_bgr: Full page image; the region is cropped from it.
        region: Area to recognise (``x``, ``y``, ``width``, ``height``,
            ``type``).

    Returns:
        One dict per non-blank detection with ``text``, ``left``, ``top``,
        ``width``, ``height`` (axis-aligned bounds of the detected box, in
        page coordinates), ``conf`` (score scaled to 0-100) and
        ``region_type``.  Empty list when the crop is empty or the engine
        returned no boxes/texts.
    """
    engine = _get_rapid_engine()

    crop = img_bgr[region.y:region.y + region.height,
                   region.x:region.x + region.width]
    if crop.size == 0:
        return []

    result = engine(crop)
    if result is None or result.boxes is None or result.txts is None:
        return []

    words = []
    for box, txt, score in zip(result.boxes, result.txts, result.scores):
        if not txt or not txt.strip():
            continue

        # Each box is a sequence of corner points; reduce it to its
        # axis-aligned bounding rectangle.
        xs = [pt[0] for pt in box]
        ys = [pt[1] for pt in box]
        left = int(min(xs))
        top = int(min(ys))

        words.append({
            'text': txt.strip(),
            # Crop-relative -> page coordinates.
            'left': left + region.x,
            'top': top + region.y,
            'width': int(max(xs) - left),
            'height': int(max(ys) - top),
            'conf': int(score * 100),
            'region_type': region.type,
        })

    return words
|
||||
|
||||
|
||||
def ocr_region_trocr(img_bgr: np.ndarray, region: PageRegion, handwritten: bool = False) -> List[Dict[str, Any]]:
    """Run TrOCR on a region and return line-level word dicts.

    The region crop is split into text lines, each line is decoded by the
    TrOCR model, and every non-empty line becomes one dict spanning the full
    region width.  Line boxes are synthetic: the region height is divided
    evenly among the recognised lines.  ``conf`` is the same for all lines,
    derived from a per-line heuristic (0.85 for lines longer than three
    characters, otherwise 0.5) averaged over the lines.

    When TrOCR is unavailable or its model is not loaded, the call falls
    back to ``ocr_region`` on a grayscale version of the page.
    NOTE(review): ``ocr_region`` does not appear among this module's visible
    imports -- confirm it is in scope, otherwise the fallback raises
    ``NameError``.

    Args:
        img_bgr: Full page image in BGR order.
        region: Area to recognise.
        handwritten: Selects the handwritten model variant.

    Returns:
        List of word dicts (``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf``, ``region_type``); empty on an empty crop, no
        recognised text, or any exception during recognition (logged).
    """
    from services.trocr_service import get_trocr_model, _split_into_lines, _check_trocr_available

    if not _check_trocr_available():
        logger.warning("TrOCR not available, falling back to Tesseract")
        if region.height > 0 and region.width > 0 and img_bgr is not None:
            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray_page, region, lang="eng+deu", psm=6)
        return []

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_trocr_model(handwritten=handwritten)
        if processor is None or model is None:
            logger.warning("TrOCR model not loaded, falling back to Tesseract")
            gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            return ocr_region(gray_page, region, lang="eng+deu", psm=6)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # If line splitting yields nothing, treat the whole crop as one line.
        line_images = _split_into_lines(pil_crop) or [pil_crop]

        device = next(model.parameters()).device
        recognised = []
        confidences = []
        for line_img in line_images:
            pixel_values = processor(images=line_img, return_tensors="pt").pixel_values.to(device)
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=128)
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if not decoded:
                continue
            recognised.append(decoded)
            # Heuristic confidence: very short outputs are less trustworthy.
            confidences.append(0.85 if len(decoded) > 3 else 0.5)

        if not recognised:
            return []

        avg_conf = int(sum(confidences) / len(confidences) * 100)
        # Distribute the region height evenly across the recognised lines.
        line_h = region.height // max(len(recognised), 1)
        return [
            {
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": avg_conf,
                "region_type": region.type,
            }
            for i, line in enumerate(recognised)
        ]

    except Exception as e:
        logger.error(f"ocr_region_trocr failed: {e}")
        return []
|
||||
|
||||
|
||||
def _lighton_fallback_ocr(img_bgr: Optional[np.ndarray], region: PageRegion) -> List[Dict[str, Any]]:
    """Fallback used when LightOnOCR cannot run: RapidOCR, else ``ocr_region``.

    Returns an empty list when no image is available.
    NOTE(review): ``ocr_region`` does not appear among this module's visible
    imports -- confirm it is in scope.
    """
    if img_bgr is None:
        return []
    if RAPIDOCR_AVAILABLE:
        return ocr_region_rapid(img_bgr, region)
    gray_page = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    return ocr_region(gray_page, region, lang="eng+deu", psm=6)


def ocr_region_lighton(img_bgr: np.ndarray, region: PageRegion) -> List[Dict[str, Any]]:
    """Run LightOnOCR-2-1B on a region and return line-level word dicts.

    The region crop is passed to the model as a single-image chat prompt;
    the decoded text is split on newlines and every non-blank line becomes
    one dict spanning the full region width.  Line boxes are synthetic (the
    region height divided evenly among the lines) and ``conf`` is a fixed 85.

    When LightOnOCR is unavailable or its model is not loaded, recognition
    falls back to RapidOCR if installed, otherwise to ``ocr_region``.

    Args:
        img_bgr: Full page image in BGR order.
        region: Area to recognise.

    Returns:
        List of word dicts (``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf``, ``region_type``); empty on an empty crop, empty
        model output, or any exception during recognition (logged).
    """
    from services.lighton_ocr_service import get_lighton_model, _check_lighton_available

    if not _check_lighton_available():
        logger.warning("LightOnOCR not available, falling back to RapidOCR/Tesseract")
        return _lighton_fallback_ocr(img_bgr, region)

    crop = img_bgr[region.y:region.y + region.height, region.x:region.x + region.width]
    if crop.size == 0:
        return []

    try:
        import torch
        from PIL import Image as _PILImage

        processor, model = get_lighton_model()
        if processor is None or model is None:
            logger.warning("LightOnOCR model not loaded, falling back to RapidOCR/Tesseract")
            return _lighton_fallback_ocr(img_bgr, region)

        pil_crop = _PILImage.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        # Image-only user turn; the chat template adds the generation prompt.
        conversation = [{"role": "user", "content": [{"type": "image"}]}]
        inputs = processor.apply_chat_template(
            conversation, images=[pil_crop],
            add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        text = processor.decode(output_ids[0], skip_special_tokens=True).strip()
        if not text:
            return []

        lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
        # Distribute the region height evenly across the recognised lines.
        line_h = region.height // max(len(lines), 1)
        return [
            {
                "text": line,
                "left": region.x,
                "top": region.y + i * line_h,
                "width": region.width,
                "height": line_h,
                "conf": 85,
                "region_type": region.type,
            }
            for i, line in enumerate(lines)
        ]

    except Exception as e:
        logger.error(f"ocr_region_lighton failed: {e}")
        return []
|
||||
|
||||
|
||||
async def ocr_region_paddle(
    img_bgr: np.ndarray,
    region: Optional["PageRegion"] = None,
) -> List[Dict[str, Any]]:
    """Run OCR via local RapidOCR (default) or remote PaddleOCR (fallback).

    Local RapidOCR is tried first unless the environment variable
    ``FORCE_REMOTE_PADDLE`` is set to ``"1"``.  If it yields no words or
    raises, the region crop (or the whole image when ``region`` is None) is
    JPEG-encoded -- downscaled so its longer side is at most 1500 px -- and
    sent to the remote PaddleOCR service.  Returned word coordinates are
    scaled back and shifted into page coordinates.

    Args:
        img_bgr: Full page image in BGR order.
        region: Optional area to recognise; None means the full image.

    Returns:
        List of word dicts in page coordinates.  With a region given,
        remote results are tagged with ``region_type``.  Empty list when the
        crop is empty or JPEG encoding fails.
    """
    force_remote = os.environ.get("FORCE_REMOTE_PADDLE", "").strip() == "1"

    if not force_remote:
        try:
            if region is None:
                h, w = img_bgr.shape[:2]
                _region = PageRegion(type="full_page", x=0, y=0, width=w, height=h)
            else:
                _region = region

            words = ocr_region_rapid(img_bgr, _region)
            if words:
                logger.info("ocr_region_paddle: used local RapidOCR (%d words)", len(words))
                return words
            logger.warning("ocr_region_paddle: RapidOCR returned 0 words, trying remote")
        except Exception as e:
            # Best effort: any local failure just routes to the remote service.
            logger.warning("ocr_region_paddle: RapidOCR failed (%s), trying remote", e)

    from services.paddleocr_remote import ocr_remote_paddle

    if region is not None:
        crop = img_bgr[
            region.y : region.y + region.height,
            region.x : region.x + region.width,
        ]
        offset_x, offset_y = region.x, region.y
    else:
        crop = img_bgr
        offset_x, offset_y = 0, 0

    if crop.size == 0:
        return []

    h, w = crop.shape[:2]
    scale = 1.0
    _MAX_DIM = 1500
    if max(h, w) > _MAX_DIM:
        scale = _MAX_DIM / max(h, w)
        # Clamp to 1 px: for extreme aspect ratios the truncated short side
        # would otherwise become 0 and cv2.resize would fail.
        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
        crop = cv2.resize(crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        logger.info("ocr_region_paddle: downscaled %dx%d → %dx%d (scale=%.2f)",
                     w, h, new_w, new_h, scale)

    success, jpg_buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 90])
    if not success:
        logger.error("ocr_region_paddle: cv2.imencode failed")
        return []

    words, _w, _h = await ocr_remote_paddle(jpg_buf.tobytes(), filename="scan.jpg")
    logger.info("ocr_region_paddle: used remote PaddleOCR (%d words)", len(words))

    # Undo the downscale (factor 1.0 when none was applied) and move the
    # boxes from crop coordinates into page coordinates.
    inv_scale = 1.0 / scale
    for wd in words:
        wd["left"] = int(wd["left"] * inv_scale) + offset_x
        wd["top"] = int(wd["top"] * inv_scale) + offset_y
        wd["width"] = int(wd["width"] * inv_scale)
        wd["height"] = int(wd["height"] * inv_scale)
        if region is not None:
            wd["region_type"] = region.type

    return words
|
||||
@@ -0,0 +1,476 @@
|
||||
"""
|
||||
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
|
||||
|
||||
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
|
||||
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
|
||||
provides functions to:
|
||||
|
||||
- Look up correct IPA pronunciations (British/American) for English words.
|
||||
- Detect and replace garbled phonetic brackets with dictionary IPA.
|
||||
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
|
||||
- Strip orphan brackets and post-bracket garbled fragments.
|
||||
- Handle IPA continuation cells (phonetics on a separate row from headword).
|
||||
|
||||
All IPA data comes from open-source dictionaries:
|
||||
- Britfone (MIT) for British English
|
||||
- eng_to_ipa / CMU (MIT) for American English
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
IPA_AVAILABLE,
|
||||
_britfone_dict,
|
||||
_ipa_convert_american,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# --- D. Phonetic Bracket IPA Replacement ---
|
||||
|
||||
# A word followed by a bracketed group, e.g. "dance [du:ns]".
# OCR often misreads the bracket characters themselves
# ([ˈdɑːns] -> {'tfatno] or (cy)), so any opener out of "[", "{", "("
# may be closed by any closer out of "]", "}", ")" -- mixed pairs such as
# "{content]" are matched on purpose.
# Group 1: the word before the bracket; group 2: the bracket content.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters.  Their presence tells correct IPA (from the
# dictionary lookup) apart from garbled OCR content when orphan brackets
# are stripped.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence (0-100) for full-page Tesseract results; words
# below it are treated as OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30
|
||||
|
||||
|
||||
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Return the IPA transcription of *word*, or None when no source knows it.

    Two sources are consulted: the Britfone dictionary (``_britfone_dict``)
    and the American converter (``_ipa_convert_american``). The preferred
    source is tried first and the other one acts as a fallback.

    Args:
        word: English word to look up (case and surrounding whitespace are
            ignored).
        pronunciation: 'american' tries the American converter first; any
            other value (including the default 'british') tries Britfone
            first.

    Returns:
        IPA string or None if not found.
    """
    key = word.lower().strip()
    if not key:
        return None

    def from_britfone() -> Optional[str]:
        # Source missing/empty, or word unknown → no result.
        if not _britfone_dict:
            return None
        return _britfone_dict.get(key) or None

    def from_american() -> Optional[str]:
        if not _ipa_convert_american:
            return None
        converted = _ipa_convert_american(key)
        # Results containing '*' are rejected (treated as "not found").
        if converted and '*' not in converted:
            return converted
        return None

    if pronunciation == 'american':
        sources = (from_american, from_britfone)
    else:
        sources = (from_britfone, from_american)

    for source in sources:
        found = source()
        if found:
            return found
    return None
|
||||
|
||||
|
||||
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Swap OCR'd phonetic transcriptions for dictionary IPA, in place.

    Example: "dance [du:ns]" becomes "dance [<dictionary IPA>]". The actual
    replacement is delegated to ``_replace_phonetics_in_text``.

    Only the 'english' field of each entry is touched. German and example
    fields carry meaningful parentheticals such as "Eis (gefrorenes Wasser)",
    "(Salat-)Gurke" or "(sich beschweren)" and must never be treated as
    phonetic transcriptions.

    Args:
        entries: Vocabulary entries; each is a dict that may have an
            'english' key.
        pronunciation: Passed through to the IPA lookup.

    Returns:
        The same list object, with 'english' values updated where needed.
    """
    if not IPA_AVAILABLE:
        return entries

    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        # Nothing to do without an opening bracket of any kind.
        if '[' not in text and '{' not in text and '(' not in text:
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text == text:
            continue
        logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
        replaced_count += 1
        entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries
|
||||
|
||||
|
||||
# Grammar particles that appear in brackets after English words:
|
||||
# cross (with), complain (about/of), agree (on/with), look (sth) up
|
||||
# These must NOT be replaced with IPA. Only used for the English field
|
||||
# (German/example fields are never processed for IPA replacement).
|
||||
_GRAMMAR_BRACKET_WORDS = frozenset({
|
||||
# English prepositions/particles commonly in vocab tables
|
||||
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
|
||||
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
|
||||
# English grammar abbreviations used in vocab tables
|
||||
'sth', 'sb', 'adj', 'adv',
|
||||
# Number/plural/grammar annotations
|
||||
'pl', 'sg', 'sing', 'no', 'also', 'auch',
|
||||
# Regional English markers
|
||||
'ae', 'be', 'ame', 'bre',
|
||||
})
|
||||
|
||||
|
||||
def _is_grammar_bracket_content(content: str) -> bool:
|
||||
"""Return True if bracket content is grammar info in the ENGLISH field.
|
||||
|
||||
Grammar info: cross (with), complain (about/of), agree (on/with)
|
||||
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
|
||||
|
||||
Since we only process the English field, we only need to recognize
|
||||
English grammar particles. Everything else is (garbled) IPA.
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
|
||||
# Split on / and spaces for patterns like (about/of), (no pl)
|
||||
tokens = re.split(r'[/\s]+', content.strip().lower())
|
||||
tokens = [t for t in tokens if t]
|
||||
if not tokens:
|
||||
return False
|
||||
|
||||
# ALL tokens must be known grammar words
|
||||
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
|
||||
|
||||
|
||||
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    Every "word + bracket" match (any bracket type, even mixed pairs) is
    rewritten to "word [ipa]" when the word has a dictionary entry. Brackets
    with more than three words, brackets after unknown words, and grammar
    brackets such as "(with)" are left untouched.

    Args:
        text: Cell text to process.
        pronunciation: Passed through to the IPA lookup.
        strip_orphans: If True, a second pass removes remaining brackets that
            look like garbled IPA. Set to False for column_text where
            brackets may be German content.

    Returns:
        The processed text with surrounding whitespace stripped; the input
        unchanged when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        full_match = match.group(0)
        word = match.group(1)
        bracket_content = match.group(2).strip()

        # More than three words inside the bracket → regular text, not IPA.
        if len(bracket_content.split()) > 3:
            return full_match

        ipa = _lookup_ipa(word, pronunciation)
        # Unknown word, or a grammar particle like cross (with) → keep as-is.
        if not ipa or _is_grammar_bracket_content(bracket_content):
            return full_match

        # Word has IPA → bracket content is phonetic (garbled or correct).
        logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
        return f"{word} [{ipa}]"

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass over every bracket still present in the text, e.g.
        # "[mais]", "{'mani setva]", trailing "(kros]".
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            keep = (
                # Grammar info such as (about/of)
                _is_grammar_bracket_content(content)
                # Correct IPA (contains Unicode IPA characters)
                or any(ch in _IPA_CHARS for ch in content)
                # Real-word parentheticals like (probieren), (Profit), (Geld):
                # at least four letters. Garbled fragments such as (cy) are
                # shorter.
                or len(re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)) >= 4
            )
            if keep:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

    text = text.strip()

    return text
|
||||
|
||||
|
||||
def _text_has_garbled_ipa(text: str) -> bool:
|
||||
"""Check if text contains garbled IPA-like fragments from OCR.
|
||||
|
||||
Returns True if there is evidence of OCR-mangled phonetic
|
||||
transcription, e.g. stress marks, length marks, or IPA special chars.
|
||||
This is used to decide whether ``_insert_missing_ipa`` should run:
|
||||
it must only insert IPA to *replace* garbled phonetics that are already
|
||||
in the text — never to ADD phonetics where none existed on the page.
|
||||
"""
|
||||
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
|
||||
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
|
||||
stripped = text.strip()
|
||||
if stripped.startswith('[') and stripped.endswith(']'):
|
||||
inner = stripped[1:-1]
|
||||
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
|
||||
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
# Not a valid dictionary-style bracket like "(no pl)" — those
|
||||
# use parentheses, not square brackets. Square brackets with
|
||||
# no IPA chars are garbled phonetics.
|
||||
return True
|
||||
|
||||
for w in text.strip().split():
|
||||
# Skip delimiters and very short tokens
|
||||
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
continue
|
||||
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
|
||||
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
|
||||
return True
|
||||
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
|
||||
return True
|
||||
# Contains IPA length mark ':' in a short non-word fragment
|
||||
if ':' in w and len(w) < 12:
|
||||
# But not things like "3:00" (time) or common words
|
||||
stripped = re.sub(r'[^a-zA-Z:]', '', w)
|
||||
if ':' in stripped and not stripped.replace(':', '').isalpha():
|
||||
continue
|
||||
return True
|
||||
# Contains IPA special characters
|
||||
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
|
||||
return True
|
||||
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
|
||||
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
|
||||
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
|
||||
# chars to avoid contractions (don't, won't, o'clock).
|
||||
if "'" in w and not w.startswith("'"):
|
||||
apos_idx = w.index("'")
|
||||
after = w[apos_idx + 1:]
|
||||
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Build IPA for a compound by concatenating the IPA of its two parts.

    E.g. "schoolbag" → "school" + "bag" → both IPA strings joined.

    Every two-way split with at least 3 characters per part is considered;
    among the splits where BOTH parts have IPA, the one with the longest
    first part is used.

    Returns:
        Concatenated IPA, or None when IPA support is unavailable, the word
        has fewer than 6 characters, or no split has IPA for both parts.
    """
    if not IPA_AVAILABLE:
        return None

    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    # Walk from the longest first part downwards: the first hit is the
    # split with the longest first part.
    for split_pos in range(len(lower) - 3, 2, -1):
        head_ipa = _lookup_ipa(lower[:split_pos], pronunciation)
        if not head_ipa:
            continue
        tail_ipa = _lookup_ipa(lower[split_pos:], pronunciation)
        if tail_ipa:
            return head_ipa + tail_ipa

    return None
|
||||
|
||||
|
||||
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the
    whitespace-separated tokens, appends "[IPA]" to the first token for which
    IPA can be determined, and strips the garbled fragments that follow it.

    IPA for a token is determined by, in order: direct dictionary lookup,
    lookup without hyphens, compound decomposition, splitting a merged
    "headword+IPA" token at its first IPA marker character, and finally the
    longest dictionary prefix of the token.

    The text is returned unchanged when IPA support is unavailable, when it
    is empty, when it already contains an opening bracket, or when it has
    more than 6 tokens.

    Args:
        text: Cell text (typically a short English vocab cell).
        pronunciation: Passed through to the IPA lookup.

    Returns:
        The tokens re-joined with single spaces, with IPA inserted where
        possible.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # Try to insert IPA for the first alphanumeric word
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip grammar particles/abbreviations (with, sth, pl, auch, ...)
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
            )
            if first_marker >= 3:
                split = first_marker
                # Move the split left over at most 3 lowercase letters.
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    # Everything after the merged token is dropped.
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break

    return ' '.join(words)
|
||||
|
||||
|
||||
@@ -0,0 +1,287 @@
|
||||
"""
|
||||
Advanced IPA repair for OCR-extracted vocabulary.
|
||||
|
||||
Functions that detect and fix garbled IPA fragments trailing after
|
||||
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
|
||||
to stay within the 500 LOC budget.
|
||||
|
||||
Contains:
|
||||
- _has_non_dict_trailing: detect non-dictionary trailing words
|
||||
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
|
||||
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
|
||||
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_lookup_ipa,
|
||||
_GRAMMAR_BRACKET_WORDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Detect a dictionary headword followed only by non-dictionary words.

    Serves as an extra trigger for ``_insert_missing_ipa`` when
    ``_text_has_garbled_ipa`` returns False because the garbled IPA happens
    to look like plain ASCII (e.g. "skea" for /skɛə/).

    Returns:
        True when the text has 2–6 tokens, a dictionary headword that is not
        the last token, and none of the tokens after it is a delimiter, a
        number, capitalised, or a dictionary word. False otherwise, and
        always False when IPA support is unavailable.
    """
    if not IPA_AVAILABLE:
        return False

    words = text.strip().split()
    if not 2 <= len(words) <= 6:
        return False

    # Locate the first token with a dictionary entry (grammar particles and
    # tokens with fewer than two usable characters are passed over).
    hw_idx = -1
    for pos, token in enumerate(words):
        candidate = re.sub(r'[^a-zA-Z\'-]', '', token)
        if len(candidate) < 2 or candidate.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(candidate, pronunciation):
            hw_idx = pos
            break

    if hw_idx < 0 or hw_idx >= len(words) - 1:
        return False

    # Any trailing token that is legitimate content means "not garbled".
    for trailing in words[hw_idx + 1:]:
        if trailing in ('–', '—', '-', '/', '|', ',', ';'):
            return False
        # Pure digits or numbering (e.g. "1", "2.", "3)")
        if re.match(r'^[\d.)\-]+$', trailing):
            return False
        letters = re.sub(r'[^a-zA-Z]', '', trailing)
        if not letters:
            continue
        # Capitalised → likely German or a proper noun
        if letters[0].isupper():
            return False
        if len(letters) >= 2 and _lookup_ipa(letters, pronunciation):
            return False

    return True
|
||||
|
||||
|
||||
def _strip_post_bracket_garbled(
|
||||
text: str, pronunciation: str = 'british',
|
||||
) -> str:
|
||||
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
|
||||
|
||||
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
|
||||
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
|
||||
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
|
||||
|
||||
For multi-word headwords like "seat belt", a real English word ("belt")
|
||||
may be followed by garbled IPA duplicates. We detect this by checking
|
||||
whether the sequence after a real word contains IPA markers (`:`, `ə`,
|
||||
etc.) — if so, everything from the first garbled token onward is stripped.
|
||||
"""
|
||||
if ']' not in text:
|
||||
return text
|
||||
last_bracket = text.rfind(']')
|
||||
if last_bracket >= len(text) - 1:
|
||||
return text
|
||||
before = text[:last_bracket + 1].rstrip()
|
||||
after = text[last_bracket + 1:].strip()
|
||||
if not after:
|
||||
return text
|
||||
|
||||
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
|
||||
after_words = after.split()
|
||||
kept: List[str] = []
|
||||
for idx, w in enumerate(after_words):
|
||||
# Delimiter — keep rest
|
||||
if w in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Contains IPA markers (length mark, IPA chars) — garbled, skip
|
||||
if any(c in w for c in _IPA_MARKER_CHARS):
|
||||
# Everything from here is garbled IPA — stop scanning
|
||||
# but look ahead: if any remaining words are real English
|
||||
# words WITHOUT IPA markers, they might be a different headword
|
||||
# following. Only skip the contiguous garbled run.
|
||||
continue
|
||||
clean = re.sub(r'[^a-zA-Z]', '', w)
|
||||
# Uppercase — likely German, keep rest
|
||||
if clean and clean[0].isupper():
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Known English word — keep it, but check if followed by garbled IPA
|
||||
# (multi-word headword case like "seat [siːt] belt si:t belt")
|
||||
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
|
||||
# Peek ahead: if next word has IPA markers, the rest is garbled
|
||||
remaining = after_words[idx + 1:]
|
||||
has_garbled_after = any(
|
||||
any(c in rw for c in _IPA_MARKER_CHARS)
|
||||
for rw in remaining
|
||||
)
|
||||
if has_garbled_after:
|
||||
# Keep this real word but stop — rest is garbled duplication
|
||||
kept.append(w)
|
||||
# Still scan for delimiters/German in the remaining words
|
||||
for ridx, rw in enumerate(remaining):
|
||||
if rw in ('–', '—', '-', '/', '|', ',', ';'):
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
rclean = re.sub(r'[^a-zA-Z]', '', rw)
|
||||
if rclean and rclean[0].isupper():
|
||||
kept.extend(remaining[ridx:])
|
||||
break
|
||||
break
|
||||
else:
|
||||
kept.extend(after_words[idx:])
|
||||
break
|
||||
# Unknown short word — likely garbled, skip
|
||||
if kept:
|
||||
return before + ' ' + ' '.join(kept)
|
||||
return before
|
||||
|
||||
|
||||
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves``).
        pronunciation: Passed through to the IPA lookup.

    Returns:
        One ``[ipa ...]`` group per dash-separated headword part, joined by
        `` – ``. If the headword row ends with an inline IPA bracket, that
        bracket is returned as-is. ``garbled_text`` is returned unchanged
        when no fix can be applied.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    # If the headword row already has inline IPA like
    # "beat [bˈiːt] , beat, beaten", only the words after the last bracket
    # still need IPA, and words before the first bracket count as covered.
    covered_words: set = set()
    if re.search(r'\[[^\]]*\]', headword_text):
        first_bracket = headword_text.index('[')
        for token in headword_text[:first_bracket].strip().split():
            normalized = re.sub(r'[^a-zA-Z\'-]', '', token).lower()
            if len(normalized) >= 2:
                covered_words.add(normalized)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()

        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — the continuation row duplicates it, so return it directly.
            return headword_text[headword_text.rfind('['):last_bracket_end + 1]

        headword_text = tail

    # Drop IPA brackets and parenthetical grammar annotations such as
    # "(no pl)", "(sth)", "(sb)".
    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split on dashes: "scarf – scarves" → ["scarf", "scarves"],
    # "see - saw - seen" → ["see", "saw", "seen"]
    parts = [
        piece.strip()
        for piece in re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
        if piece.strip()
    ]
    if not parts:
        return garbled_text

    # Articles never get IPA in vocab books. Other function words such as
    # "down" / "up" stay because they are part of phrasal verbs
    # (e.g. "close down").
    _ARTICLES = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
        word_ipas: List[str] = []
        for w in part.split():
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if len(clean_w) < 2:
                continue
            lowered = clean_w.lower()
            # Already has inline IPA in the headword row, or is an article.
            if lowered in covered_words or lowered in _ARTICLES:
                continue
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result
|
||||
|
||||
|
||||
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
|
||||
"""Insert IPA for the first English headword in a long mixed-language line.
|
||||
|
||||
Unlike _insert_missing_ipa (for short column_en cells), this handles
|
||||
column_text lines of any length. It only inserts IPA for the FIRST word
|
||||
if that word:
|
||||
- has no bracket following it already
|
||||
- has an IPA entry in the dictionary
|
||||
- is not a number/symbol prefix like "».55"
|
||||
|
||||
Returns the text with [ipa] inserted after the first word, or unchanged.
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return text
|
||||
if not text or not text.strip():
|
||||
return text
|
||||
|
||||
words = text.strip().split()
|
||||
if not words:
|
||||
return text
|
||||
|
||||
# Check if text already starts with a bracket (IPA already present)
|
||||
if len(words) > 1 and words[1].startswith(('[', '{', '(')):
|
||||
return text
|
||||
|
||||
# Try the first few words (skip numeric prefixes like "».55", "0.56")
|
||||
for i in range(min(3, len(words))):
|
||||
w = words[i]
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
|
||||
if not clean or len(clean) < 2:
|
||||
continue
|
||||
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
|
||||
continue
|
||||
ipa = _lookup_ipa(clean, pronunciation)
|
||||
if ipa:
|
||||
words[i] = f"{w} [{ipa}]"
|
||||
return ' '.join(words)
|
||||
# Stop at first real word even if no IPA found
|
||||
break
|
||||
|
||||
return text
|
||||
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
|
||||
|
||||
- Character confusion fix (I/1/l/|)
|
||||
- Comma-separated word form splitting
|
||||
- Example sentence attachment to matching vocab entries
|
||||
|
||||
Split from cv_ocr_engines.py for maintainability.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Post-Processing: Deterministic Quality Fixes
|
||||
# =============================================================================
|
||||
|
||||
# --- A. Character Confusion Fix (I/1/l) ---
|
||||
|
||||
# Common OCR confusion pairs in vocabulary context
|
||||
_CHAR_CONFUSION_RULES = [
|
||||
# "1" at word start followed by lowercase → likely "I" or "l"
|
||||
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
|
||||
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
|
||||
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
|
||||
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
|
||||
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
|
||||
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
|
||||
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
|
||||
]
|
||||
|
||||
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
|
||||
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
|
||||
|
||||
|
||||
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Fix common OCR character confusions using context.
|
||||
|
||||
Deterministic rules:
|
||||
- "1" at word start → "I" or "l" based on context
|
||||
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
|
||||
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
|
||||
"""
|
||||
for entry in entries:
|
||||
en = entry.get('english', '') or ''
|
||||
de = entry.get('german', '') or ''
|
||||
ex = entry.get('example', '') or ''
|
||||
|
||||
# Apply general rules to all fields
|
||||
for pattern, replacement in _CHAR_CONFUSION_RULES:
|
||||
en = pattern.sub(replacement, en)
|
||||
de = pattern.sub(replacement, de)
|
||||
ex = pattern.sub(replacement, ex)
|
||||
|
||||
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
|
||||
de_lower_words = set(de.lower().replace(',', ' ').split())
|
||||
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
|
||||
# Any remaining "1" in EN that looks like "I"
|
||||
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
|
||||
|
||||
# Fix "y " artifact before repeated word: "y you" → "you"
|
||||
en = re.sub(r'\by\s+([a-z])', r'\1', en)
|
||||
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
|
||||
|
||||
entry['english'] = en.strip()
|
||||
entry['german'] = de.strip()
|
||||
entry['example'] = ex.strip()
|
||||
|
||||
return entries
|
||||
|
||||
|
||||
# --- B. Comma-Separated Word Form Splitting ---
|
||||
|
||||
def _is_singular_plural_pair(parts: List[str]) -> bool:
|
||||
"""Detect if comma-separated parts are singular/plural forms of the same word.
|
||||
|
||||
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
|
||||
"break, broke, broken" → False (different verb forms, OK to split).
|
||||
|
||||
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
|
||||
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
|
||||
"""
|
||||
if len(parts) != 2:
|
||||
return False
|
||||
|
||||
a, b = parts[0].lower().strip(), parts[1].lower().strip()
|
||||
if not a or not b:
|
||||
return False
|
||||
|
||||
# Common prefix heuristic: if words share >= 50% of the shorter word,
|
||||
# they are likely forms of the same word (Maus/Mäuse, child/children).
|
||||
min_len = min(len(a), len(b))
|
||||
common = 0
|
||||
for ca, cb in zip(a, b):
|
||||
if ca == cb:
|
||||
common += 1
|
||||
else:
|
||||
break
|
||||
if common >= max(2, min_len * 0.5):
|
||||
return True
|
||||
|
||||
# Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
|
||||
umlaut_map = str.maketrans('aou', 'äöü')
|
||||
if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Split entries with comma-separated word forms into individual entries.
|
||||
|
||||
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
|
||||
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
|
||||
|
||||
Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
|
||||
because those are forms of the same vocabulary entry.
|
||||
|
||||
Only splits when both EN and DE have the same number of comma-parts,
|
||||
parts are short (word forms, not sentences), and at least 3 parts
|
||||
(to avoid splitting pairs that likely belong together).
|
||||
"""
|
||||
result: List[Dict[str, Any]] = []
|
||||
|
||||
for entry in entries:
|
||||
en = (entry.get('english', '') or '').strip()
|
||||
de = (entry.get('german', '') or '').strip()
|
||||
|
||||
# Split by comma (but not inside brackets or parentheses)
|
||||
en_parts = _split_by_comma(en)
|
||||
de_parts = _split_by_comma(de)
|
||||
|
||||
# Only split if we have multiple parts and counts match
|
||||
should_split = False
|
||||
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
|
||||
# All parts must be short (word forms, not sentences)
|
||||
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
|
||||
# Do NOT split singular/plural pairs (2 parts that are
|
||||
# forms of the same word)
|
||||
if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
|
||||
should_split = False
|
||||
else:
|
||||
should_split = True
|
||||
|
||||
if not should_split:
|
||||
result.append(entry)
|
||||
continue
|
||||
|
||||
# Split into individual entries
|
||||
for k in range(len(en_parts)):
|
||||
sub = dict(entry) # shallow copy
|
||||
sub['english'] = en_parts[k].strip()
|
||||
sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
|
||||
sub['example'] = '' # examples get attached later
|
||||
sub['split_from_comma'] = True
|
||||
result.append(sub)
|
||||
|
||||
# Re-number
|
||||
for i, e in enumerate(result):
|
||||
e['row_index'] = i
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _split_by_comma(text: str) -> List[str]:
|
||||
"""Split text by commas, but not inside brackets [...] or parens (...)."""
|
||||
if ',' not in text:
|
||||
return [text]
|
||||
|
||||
parts = []
|
||||
depth_bracket = 0
|
||||
depth_paren = 0
|
||||
current = []
|
||||
|
||||
for ch in text:
|
||||
if ch == '[':
|
||||
depth_bracket += 1
|
||||
elif ch == ']':
|
||||
depth_bracket = max(0, depth_bracket - 1)
|
||||
elif ch == '(':
|
||||
depth_paren += 1
|
||||
elif ch == ')':
|
||||
depth_paren = max(0, depth_paren - 1)
|
||||
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
|
||||
parts.append(''.join(current).strip())
|
||||
current = []
|
||||
continue
|
||||
current.append(ch)
|
||||
|
||||
if current:
|
||||
parts.append(''.join(current).strip())
|
||||
|
||||
# Filter empty parts
|
||||
return [p for p in parts if p]
|
||||
|
||||
|
||||
# --- C. Example Sentence Attachment ---
|
||||
|
||||
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
|
||||
"""Find the vocab entry whose English word(s) best match the example sentence.
|
||||
|
||||
Returns index into vocab_entries, or -1 if no match found.
|
||||
Uses word stem overlap: "a broken arm" matches "broken" or "break".
|
||||
"""
|
||||
if not vocab_entries or not example_text:
|
||||
return -1
|
||||
|
||||
example_lower = example_text.lower()
|
||||
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
|
||||
|
||||
best_idx = -1
|
||||
best_score = 0
|
||||
|
||||
for i, entry in enumerate(vocab_entries):
|
||||
en = (entry.get('english', '') or '').lower()
|
||||
if not en:
|
||||
continue
|
||||
|
||||
# Extract vocab words (split on space, comma, newline)
|
||||
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
|
||||
|
||||
# Score: how many vocab words appear in the example?
|
||||
# Also check if example words share a common stem (first 4 chars)
|
||||
direct_matches = vocab_words & example_words
|
||||
score = len(direct_matches) * 10
|
||||
|
||||
# Stem matching: "broken" matches "break" via shared prefix "bro"/"bre"
|
||||
if score == 0:
|
||||
for vw in vocab_words:
|
||||
if len(vw) < 3:
|
||||
continue
|
||||
stem = vw[:4] if len(vw) >= 4 else vw[:3]
|
||||
for ew in example_words:
|
||||
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
|
||||
score += 5
|
||||
break
|
||||
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_idx = i
|
||||
|
||||
return best_idx if best_score > 0 else -1
|
||||
|
||||
|
||||
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Attach rows with EN text but no DE translation as examples to matching vocab entries.
|
||||
|
||||
Vocabulary worksheets often have:
|
||||
Row 1: break, broke, broken / brechen, brach, gebrochen
|
||||
Row 2: a broken arm (no DE → example for "broken")
|
||||
Row 3: a broken plate (no DE → example for "broken")
|
||||
Row 4: egg / Ei (has DE → new vocab entry)
|
||||
|
||||
Rules (deterministic, generic):
|
||||
- A row is an "example row" if it has EN text but NO DE text (or very short DE ≤2 chars)
|
||||
- Find the best matching vocab entry by checking which entry's English words
|
||||
appear in the example sentence (semantic matching via word overlap)
|
||||
- Fall back to the nearest preceding entry if no word match found
|
||||
- Multiple examples get joined with " | "
|
||||
"""
|
||||
if not entries:
|
||||
return entries
|
||||
|
||||
# Separate into vocab entries (have DE) and example candidates (no DE)
|
||||
vocab_entries: List[Dict[str, Any]] = []
|
||||
examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts
|
||||
|
||||
for entry in entries:
|
||||
en = (entry.get('english', '') or '').strip()
|
||||
de = (entry.get('german', '') or '').strip()
|
||||
ex = (entry.get('example', '') or '').strip()
|
||||
|
||||
# Treat single-char DE as OCR noise, not real translation.
|
||||
# "Ei" (2 chars) is a valid German word, so threshold is 1.
|
||||
has_de = len(de) > 1
|
||||
has_en = bool(en)
|
||||
|
||||
# Heuristic: a row without DE is an "example sentence" only if
|
||||
# the EN text looks like a sentence (>= 4 words, or contains
|
||||
# typical sentence punctuation). Short EN text (1-3 words) is
|
||||
# more likely a vocab entry whose DE was missed by OCR.
|
||||
_looks_like_sentence = (
|
||||
len(en.split()) >= 4
|
||||
or en.rstrip().endswith(('.', '!', '?'))
|
||||
)
|
||||
is_example_candidate = (
|
||||
has_en and not has_de and _looks_like_sentence and vocab_entries
|
||||
)
|
||||
|
||||
if is_example_candidate:
|
||||
# This is an example sentence — find best matching vocab entry
|
||||
example_text = en
|
||||
|
||||
match_idx = _find_best_vocab_match(en, vocab_entries)
|
||||
if match_idx < 0:
|
||||
# No word match → fall back to last entry
|
||||
match_idx = len(vocab_entries) - 1
|
||||
|
||||
if match_idx not in examples_for:
|
||||
examples_for[match_idx] = []
|
||||
examples_for[match_idx].append(example_text)
|
||||
else:
|
||||
vocab_entries.append(entry)
|
||||
|
||||
# Attach examples to their matched vocab entries
|
||||
for idx, example_list in examples_for.items():
|
||||
if 0 <= idx < len(vocab_entries):
|
||||
entry = vocab_entries[idx]
|
||||
existing_ex = (entry.get('example', '') or '').strip()
|
||||
new_examples = ' | '.join(example_list)
|
||||
entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
|
||||
|
||||
# Re-number
|
||||
for i, e in enumerate(vocab_entries):
|
||||
e['row_index'] = i
|
||||
|
||||
return vocab_entries
|
||||
@@ -0,0 +1,134 @@
|
||||
"""
|
||||
Word assembly helpers for OCR output.
|
||||
|
||||
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
|
||||
into visual lines, rejoins hyphenated words, and produces reading-order
|
||||
text. All functions are pure standard-library; no NumPy or project
|
||||
imports required.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
|
||||
"""Group words by Y position into lines, sorted by X within each line."""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
|
||||
lines: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [sorted_words[0]]
|
||||
current_y = sorted_words[0]['top']
|
||||
|
||||
for word in sorted_words[1:]:
|
||||
if abs(word['top'] - current_y) <= y_tolerance_px:
|
||||
current_line.append(word)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
current_line = [word]
|
||||
current_y = word['top']
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
lines.append(current_line)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
|
||||
"""Group OCR words into visual lines in reading order.
|
||||
|
||||
Returns a list of line strings (one per visual line in the cell).
|
||||
"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
|
||||
return [' '.join(w['text'] for w in line) for line in lines]
|
||||
|
||||
|
||||
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
|
||||
"""Rejoin words split by line-break hyphenation.
|
||||
|
||||
E.g. ['Fu\u00df-', 'boden'] \u2192 ['Fu\u00dfboden']
|
||||
['some text-', 'thing here'] \u2192 ['something here']
|
||||
"""
|
||||
if len(lines) <= 1:
|
||||
return lines
|
||||
|
||||
result = []
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
# If line ends with '-' and there's a next line, rejoin
|
||||
if i + 1 < len(lines) and line.rstrip().endswith('-'):
|
||||
stripped = line.rstrip()
|
||||
# Get the word fragment before hyphen (last word)
|
||||
prefix = stripped[:-1] # remove trailing hyphen
|
||||
next_line = lines[i + 1]
|
||||
# Join: last word of this line + first word of next line
|
||||
prefix_words = prefix.rsplit(' ', 1)
|
||||
next_words = next_line.split(' ', 1)
|
||||
if len(prefix_words) > 1:
|
||||
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
|
||||
else:
|
||||
joined = prefix_words[0] + next_words[0]
|
||||
remainder = next_words[1] if len(next_words) > 1 else ''
|
||||
if remainder:
|
||||
result.append(joined + ' ' + remainder)
|
||||
else:
|
||||
result.append(joined)
|
||||
i += 2
|
||||
else:
|
||||
result.append(line)
|
||||
i += 1
|
||||
return result
|
||||
|
||||
|
||||
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Return the OCR words as newline-separated reading-order text.

    Pipeline: ``_words_to_reading_order_lines`` builds one string per
    visual line, ``_rejoin_hyphenated`` merges line-break hyphenations, and
    the remaining lines are joined with ``'\\n'``.
    """
    visual_lines = _words_to_reading_order_lines(words, y_tolerance_px)
    return '\n'.join(_rejoin_hyphenated(visual_lines))
|
||||
|
||||
|
||||
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Words are grouped into visual lines (``_group_words_into_lines``).
    Within a line, the pixel gap between two neighbouring words is divided
    by the line's average character width and rounded to get the number of
    spaces to insert (never fewer than one). Lines are joined with newlines.

    The average character width is the summed ``width`` of all words with
    non-empty text divided by their summed text length. When that cannot be
    computed, or is not positive (no text, or zero-width boxes), a default
    of 10 px is used so the gap division can never divide by zero.

    Args:
        words: Word dicts with ``left``, ``width``, ``top`` and ``text``.
        y_tolerance_px: Vertical tolerance passed to the line grouping.

    Returns:
        The spaced text, one output line per visual line.
    """
    default_char_width = 10
    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []

    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Average character width over all words of the line that have text.
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        if total_chars > 0 and total_width > 0:
            avg_char_width = total_width / total_chars
        else:
            # Degenerate boxes would otherwise give a zero divisor below.
            avg_char_width = default_char_width

        parts = []
        for i, word in enumerate(sorted_words):
            parts.append(word.get('text', ''))
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
|
||||
@@ -0,0 +1,275 @@
|
||||
"""
|
||||
Gutter Repair Core — spellchecker setup, data types, and single-word repair logic.
|
||||
|
||||
Extracted from cv_gutter_repair.py for modularity.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Spellchecker setup (lazy, cached)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_spell_de = None
|
||||
_spell_en = None
|
||||
_SPELL_AVAILABLE = False
|
||||
|
||||
def _init_spellcheckers():
|
||||
"""Lazy-load DE + EN spellcheckers (cached across calls)."""
|
||||
global _spell_de, _spell_en, _SPELL_AVAILABLE
|
||||
if _spell_de is not None:
|
||||
return
|
||||
try:
|
||||
from spellchecker import SpellChecker
|
||||
_spell_de = SpellChecker(language='de', distance=1)
|
||||
_spell_en = SpellChecker(language='en', distance=1)
|
||||
_SPELL_AVAILABLE = True
|
||||
logger.info("Gutter repair: spellcheckers loaded (DE + EN)")
|
||||
except ImportError:
|
||||
logger.warning("pyspellchecker not installed — gutter repair unavailable")
|
||||
|
||||
|
||||
def _is_known(word: str) -> bool:
    """Return True if the lower-cased word is in the DE or EN dictionary.

    Triggers the lazy spellchecker setup; returns False when the
    spellcheckers are unavailable. The German dictionary is consulted
    first.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return False

    lowered = word.lower()
    for checker in (_spell_de, _spell_en):
        if checker.known([lowered]):
            return True
    return False
|
||||
|
||||
|
||||
def _spell_candidates(word: str, lang: str = "both") -> List[str]:
    """Get all plausible spellchecker candidates for a word (deduplicated).

    Args:
        word: The word to correct; it is lower-cased before lookup.
        lang: ``"de"`` or ``"both"`` select the German checker or both
            checkers (German first); any other value selects the English
            checker only.

    Returns:
        Candidate corrections, excluding the lower-cased input itself and
        duplicates. Candidates of one checker are listed in sorted order so
        the result does not depend on set iteration order. Empty when the
        spellcheckers are unavailable.
    """
    _init_spellcheckers()
    if not _SPELL_AVAILABLE:
        return []

    w = word.lower()
    if lang == "both":
        checkers = [_spell_de, _spell_en]
    elif lang == "de":
        checkers = [_spell_de]
    else:
        checkers = [_spell_en]

    seen: set = set()
    results: List[str] = []
    for checker in checkers:
        if checker is None:
            continue
        # candidates() hands back an unordered collection (or nothing);
        # sorting keeps downstream tie-breaking reproducible between runs.
        for c in sorted(checker.candidates(w) or ()):
            if c and c != w and c not in seen:
                seen.add(c)
                results.append(c)

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gutter position detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Minimum word length for spell-fix (very short words are often legitimate)
|
||||
_MIN_WORD_LEN_SPELL = 3
|
||||
|
||||
# Minimum word length for hyphen-join candidates (fragments at the gutter
|
||||
# can be as short as 1-2 chars, e.g. "ve" from "ver-künden")
|
||||
_MIN_WORD_LEN_HYPHEN = 2
|
||||
|
||||
# How close to the right column edge a word must be to count as "gutter-adjacent".
|
||||
# Expressed as a fraction of the column width: 0.70 means the word's right edge must lie in the rightmost 30% of the column.
|
||||
_GUTTER_EDGE_THRESHOLD = 0.70
|
||||
|
||||
# Small common words / abbreviations that should NOT be repaired
|
||||
_STOPWORDS = frozenset([
|
||||
# German
|
||||
"ab", "an", "am", "da", "er", "es", "im", "in", "ja", "ob", "so", "um",
|
||||
"zu", "wo", "du", "eh", "ei", "je", "na", "nu", "oh",
|
||||
# English
|
||||
"a", "am", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in",
|
||||
"is", "it", "me", "my", "no", "of", "on", "or", "so", "to", "up", "us",
|
||||
"we",
|
||||
])
|
||||
|
||||
# IPA / phonetic patterns — skip these cells
|
||||
_IPA_RE = re.compile(r'[\[\]/ˈˌːʃʒθðŋɑɒæɔəɛɪʊʌ]')
|
||||
|
||||
|
||||
def _is_ipa_text(text: str) -> bool:
    """Return True if ``text`` contains any character matched by ``_IPA_RE``."""
    return _IPA_RE.search(text) is not None
|
||||
|
||||
|
||||
def _word_is_at_gutter_edge(word_bbox: Dict, col_x: float, col_width: float) -> bool:
|
||||
"""Check if a word's right edge is near the right boundary of its column."""
|
||||
if col_width <= 0:
|
||||
return False
|
||||
word_right = word_bbox.get("left", 0) + word_bbox.get("width", 0)
|
||||
col_right = col_x + col_width
|
||||
# Word's right edge within the rightmost portion of the column
|
||||
relative_pos = (word_right - col_x) / col_width
|
||||
return relative_pos >= _GUTTER_EDGE_THRESHOLD
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Suggestion types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class GutterSuggestion:
    """One proposed correction for a word at the gutter edge."""

    # --- identity -----------------------------------------------------------
    # Short identifier: the first 8 hex digits of a random UUID.
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    # "hyphen_join" | "spell_fix"
    type: str = ""

    # --- location of the affected cell --------------------------------------
    zone_index: int = 0
    row_index: int = 0
    col_index: int = 0
    col_type: str = ""
    cell_id: str = ""

    # --- the correction itself ----------------------------------------------
    original_text: str = ""
    suggested_text: str = ""

    # --- hyphen_join only: the continuation cell in the following row -------
    next_row_index: int = -1
    next_row_cell_id: str = ""
    next_row_text: str = ""
    missing_chars: str = ""
    display_parts: List[str] = field(default_factory=list)

    # --- other plausible corrections the user can pick from -----------------
    alternatives: List[str] = field(default_factory=list)

    # --- meta ----------------------------------------------------------------
    confidence: float = 0.0
    # "gutter_truncation" | "gutter_blur" | "hyphen_continuation"
    reason: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core repair logic
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_TRAILING_PUNCT_RE = re.compile(r'[.,;:!?\)\]]+$')
|
||||
|
||||
|
||||
def _try_hyphen_join(
    word_text: str,
    next_word_text: str,
    max_missing: int = 3,
) -> Optional[Tuple[str, str, float]]:
    """Try to fuse two word fragments into one dictionary word.

    Trailing hyphens and whitespace are removed from ``word_text``; leading
    whitespace and trailing punctuation (``_TRAILING_PUNCT_RE``) are removed
    from ``next_word_text`` so that e.g. "künden," can be looked up.

    The plain concatenation is tested first (confidence 0.95). Failing
    that, 1..``max_missing`` filler letters, drawn from the first 15
    letters of a fixed alphabet, are inserted between the fragments; the
    first known word wins, with confidence 0.90 for one filler letter and
    0.10 less for each additional one.

    Returns ``(joined_word, missing_chars, confidence)`` or None.
    """
    stem = word_text.rstrip("-").rstrip()
    tail = _TRAILING_PUNCT_RE.sub('', next_word_text.lstrip())
    if not (stem and tail):
        return None

    # 1. Fragments fit together as they are.
    whole = stem + tail
    if _is_known(whole):
        return (whole, "", 0.95)

    # 2. Interpolate letters that may have been lost between the fragments.
    filler_alphabet = "enristaldhgcmobwfkzpvjyxqu"[:15]
    for gap in range(1, max_missing + 1):
        confidence = 0.90 - (gap - 1) * 0.10
        for combo in itertools.product(filler_alphabet, repeat=gap):
            filler = "".join(combo)
            guess = stem + filler + tail
            if _is_known(guess):
                return (guess, filler, confidence)

    return None
|
||||
|
||||
|
||||
def _try_spell_fix(
    word_text: str, col_type: str = "",
) -> Optional[Tuple[str, float, List[str]]]:
    """Propose a spellchecker correction for a single garbled word.

    Returns None when the word is shorter than ``_MIN_WORD_LEN_SPELL``,
    when the word without surrounding parentheses is already known (e.g.
    "probieren)" or "(Englisch"), or when no candidates exist.

    Candidates come from the language implied by ``col_type`` (contains
    "en" -> English, else contains "de" -> German, else both), falling back
    to both languages if the preferred one yields nothing. They are ranked
    by edit distance to the lower-cased input.

    Returns ``(best_correction, confidence, alternatives)`` where
    confidence is ``max(0.5, 1.0 - 0.15 * distance)`` and ``alternatives``
    holds up to five further candidates. If the input starts with an
    uppercase letter, the first letter of every returned word is
    upper-cased.
    """
    if len(word_text) < _MIN_WORD_LEN_SPELL:
        return None

    # A valid word merely wrapped in parentheses is not an OCR error.
    bare = word_text.strip("()")
    if bare and _is_known(bare):
        return None

    if "en" in col_type:
        preferred = "en"
    elif "de" in col_type:
        preferred = "de"
    else:
        preferred = "both"

    suggestions = _spell_candidates(word_text, lang=preferred)
    if not suggestions and preferred != "both":
        suggestions = _spell_candidates(word_text, lang="both")
    if not suggestions:
        return None

    capitalised = word_text[0].isupper()

    def _recase(candidate: str) -> str:
        # Mirror the capitalisation of the original word's first letter.
        if capitalised and candidate:
            return candidate[0].upper() + candidate[1:]
        return candidate

    needle = word_text.lower()
    ranked = sorted(
        ((_edit_distance(needle, cand.lower()), cand) for cand in suggestions),
        key=lambda pair: pair[0],
    )

    top_distance, top_word = ranked[0]
    top_word = _recase(top_word)
    confidence = max(0.5, 1.0 - top_distance * 0.15)

    others = [
        _recase(cand)
        for _, cand in ranked[1:]
        if cand.lower() != top_word.lower()
    ][:5]

    return (top_word, confidence, others)
|
||||
|
||||
|
||||
def _edit_distance(a: str, b: str) -> int:
|
||||
"""Simple Levenshtein distance."""
|
||||
if len(a) < len(b):
|
||||
return _edit_distance(b, a)
|
||||
if len(b) == 0:
|
||||
return len(a)
|
||||
prev = list(range(len(b) + 1))
|
||||
for i, ca in enumerate(a):
|
||||
curr = [i + 1]
|
||||
for j, cb in enumerate(b):
|
||||
cost = 0 if ca == cb else 1
|
||||
curr.append(min(curr[j] + 1, prev[j + 1] + 1, prev[j] + cost))
|
||||
prev = curr
|
||||
return prev[len(b)]
|
||||
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Gutter Repair Grid — grid analysis and suggestion application.
|
||||
|
||||
Extracted from cv_gutter_repair.py for modularity.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from cv_gutter_repair_core import (
|
||||
_init_spellcheckers,
|
||||
_is_ipa_text,
|
||||
_is_known,
|
||||
_MIN_WORD_LEN_HYPHEN,
|
||||
_SPELL_AVAILABLE,
|
||||
_STOPWORDS,
|
||||
_TRAILING_PUNCT_RE,
|
||||
_try_hyphen_join,
|
||||
_try_spell_fix,
|
||||
_word_is_at_gutter_edge,
|
||||
GutterSuggestion,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Grid analysis
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def analyse_grid_for_gutter_repair(
    grid_data: Dict[str, Any],
    image_width: int = 0,
) -> Dict[str, Any]:
    """Analyse a structured grid and return gutter repair suggestions.

    For every non-empty, non-IPA cell the last word is inspected. It is a
    gutter candidate when it sits at the right edge of its column, is not a
    stopword, has at least ``_MIN_WORD_LEN_HYPHEN`` characters, is not a
    known word, and is not a visible hyphenation whose stem is known. For a
    candidate, two strategies run in order:

    1. ``hyphen_join``: join with the first word of the cell below (same
       column), when the candidate ends in "-" or the next cell starts with
       a lowercase letter.
    2. ``spell_fix``: single-word spellchecker correction.

    Args:
        grid_data: The grid_editor_result from the session (zones→cells
            structure).
        image_width: Image width in pixels. Currently not used by the
            analysis.

    Returns:
        Dict with "suggestions" (list of suggestion dicts), "stats" and
        "duration_seconds". When the spellcheckers are unavailable, "stats"
        holds an "error" entry and no suggestions are produced.
    """
    # _init_spellcheckers() rebinds _SPELL_AVAILABLE inside
    # cv_gutter_repair_core. The name imported at the top of this module is
    # only a snapshot taken at import time, so the live value must be read
    # through the module object.
    import cv_gutter_repair_core as _core

    t0 = time.time()
    _init_spellcheckers()

    if not _core._SPELL_AVAILABLE:
        return {
            "suggestions": [],
            "stats": {"error": "pyspellchecker not installed"},
            "duration_seconds": 0,
        }

    zones = grid_data.get("zones", [])
    suggestions: List[GutterSuggestion] = []
    words_checked = 0
    gutter_candidates = 0

    for zi, zone in enumerate(zones):
        columns = zone.get("columns", [])
        cells = zone.get("cells", [])
        if not columns or not cells:
            continue

        # Build column lookup: col_index → {x, width, type}
        col_info: Dict[int, Dict] = {}
        for col in columns:
            ci = col.get("index", col.get("col_index", -1))
            x_min = col.get("x_min_px", col.get("x", 0))
            if "x_max_px" in col:
                width = col["x_max_px"] - x_min
            else:
                # Columns given as x/width already carry the width;
                # subtracting x from it would be wrong.
                width = col.get("width", 0)
            col_info[ci] = {
                "x": x_min,
                "width": width,
                "type": col.get("type", col.get("col_type", "")),
            }

        # Build (row, col) → cell lookup
        cell_map: Dict[Tuple[int, int], Dict] = {}
        for cell in cells:
            cell_map[(cell.get("row_index", 0), cell.get("col_index", 0))] = cell

        # All columns are checked: a word is a candidate if it is at the
        # right edge of its column AND not a known word.
        for (ri, ci), cell in cell_map.items():
            text = (cell.get("text") or "").strip()
            if not text or _is_ipa_text(text):
                continue

            words_checked += 1
            col = col_info.get(ci, {})
            col_type = col.get("type", "")
            word_boxes = cell.get("word_boxes", [])

            # Only the LAST word of the cell is inspected.
            cell_words = text.split()
            if not cell_words:
                continue
            last_word = cell_words[-1]

            # Skip stopwords
            if last_word.lower().rstrip(".,;:!?-") in _STOPWORDS:
                continue

            last_word_clean = last_word.rstrip(".,;:!?)(")
            if len(last_word_clean) < _MIN_WORD_LEN_HYPHEN:
                continue

            # Position check: last word box if available, else the cell bbox.
            if word_boxes:
                edge_box = word_boxes[-1]
            else:
                bbox = cell.get("bbox_px", {})
                edge_box = {"left": bbox.get("x", 0), "width": bbox.get("w", 0)}
            if not _word_is_at_gutter_edge(
                edge_box, col.get("x", 0), col.get("width", 1)
            ):
                continue

            # Word is at gutter edge — known words need no repair.
            if _is_known(last_word_clean):
                continue

            ends_with_hyphen = last_word.endswith("-")

            # A visible hyphen plus a known stem is an intentional word-wrap
            # ("wunder-"), not a gutter error: gutter damage loses the hyphen.
            if ends_with_hyphen:
                stem = last_word_clean.rstrip("-")
                if stem and _is_known(stem):
                    continue

            gutter_candidates += 1
            cell_id = cell.get("cell_id", f"R{ri:02d}_C{ci}")

            # --- Strategy 1: Hyphen join with next row ---
            next_cell = cell_map.get((ri + 1, ci))
            if next_cell:
                next_text = (next_cell.get("text") or "").strip()
                next_words = next_text.split()
                if next_words:
                    first_next = next_words[0]
                    first_next_clean = _TRAILING_PUNCT_RE.sub('', first_next)
                    first_alpha = next((c for c in first_next if c.isalpha()), "")

                    # Also skip if the joined word is known (covers compounds
                    # whose stem alone might not be in the dictionary).
                    if ends_with_hyphen and first_next_clean:
                        direct = last_word_clean.rstrip("-") + first_next_clean
                        if _is_known(direct):
                            continue

                    # Continuation likely if there is an explicit hyphen, or
                    # the next row starts lowercase (= not a new entry).
                    if ends_with_hyphen or (first_alpha and first_alpha.islower()):
                        result = _try_hyphen_join(last_word_clean, first_next)
                        if result:
                            joined, missing, conf = result
                            # Display part 1 shows the repaired fragment with
                            # a hyphen, mirroring the original line layout.
                            if ends_with_hyphen:
                                display_p1 = last_word_clean.rstrip("-")
                            else:
                                display_p1 = last_word_clean
                            display_p1 += missing + "-"

                            suggestions.append(GutterSuggestion(
                                type="hyphen_join",
                                zone_index=zi,
                                row_index=ri,
                                col_index=ci,
                                col_type=col_type,
                                cell_id=cell_id,
                                original_text=last_word,
                                suggested_text=joined,
                                next_row_index=ri + 1,
                                next_row_cell_id=next_cell.get("cell_id", f"R{ri+1:02d}_C{ci}"),
                                next_row_text=next_text,
                                missing_chars=missing,
                                display_parts=[display_p1, first_next],
                                confidence=conf,
                                reason="gutter_truncation" if missing else "hyphen_continuation",
                            ))
                            continue  # skip spell_fix if hyphen_join found

            # --- Strategy 2: Single-word spell fix ---
            fix_result = _try_spell_fix(last_word_clean, col_type)
            if fix_result:
                corrected, conf, alts = fix_result
                suggestions.append(GutterSuggestion(
                    type="spell_fix",
                    zone_index=zi,
                    row_index=ri,
                    col_index=ci,
                    col_type=col_type,
                    cell_id=cell_id,
                    original_text=last_word,
                    suggested_text=corrected,
                    alternatives=alts,
                    confidence=conf,
                    reason="gutter_blur",
                ))

    duration = round(time.time() - t0, 3)

    logger.info(
        "Gutter repair: checked %d words, %d gutter candidates, %d suggestions (%.2fs)",
        words_checked, gutter_candidates, len(suggestions), duration,
    )

    return {
        "suggestions": [s.to_dict() for s in suggestions],
        "stats": {
            "words_checked": words_checked,
            "gutter_candidates": gutter_candidates,
            "suggestions_found": len(suggestions),
        },
        "duration_seconds": duration,
    }
|
||||
|
||||
|
||||
def apply_gutter_suggestions(
    grid_data: Dict[str, Any],
    accepted_ids: List[str],
    suggestions: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Apply accepted gutter repair suggestions to the grid data.

    Matching cells inside ``grid_data`` are modified in place. For every
    accepted suggestion the last occurrence of its ``original_text`` in the
    target cell is replaced:

    * ``spell_fix``   -> replaced by ``suggested_text``.
    * ``hyphen_join`` -> replaced by ``display_parts[0]`` (the restored,
      hyphenated first half). The next row is NOT modified: the continuation
      (e.g. "künden") stays in its own row because the original book layout
      has it there; only the truncated word in the current row is repaired
      (e.g. "ve" -> "ver-").

    Suggestions are skipped (and not counted) when the zone index is out of
    range, no cell matches the row/column, required fields are empty, the
    type is unknown, or the original word is not found in the cell text.

    Args:
        grid_data: The grid_editor_result (zones -> cells).
        accepted_ids: List of suggestion IDs the user accepted.
        suggestions: The full suggestions list (from
            analyse_grid_for_gutter_repair).

    Returns:
        Dict with "applied_count" (number of suggestions that actually
        changed a cell) and "changes" (one record per applied suggestion).
    """
    accepted_set = set(accepted_ids)
    accepted_suggestions = [s for s in suggestions if s.get("id") in accepted_set]

    zones = grid_data.get("zones", [])
    changes: List[Dict[str, Any]] = []

    for s in accepted_suggestions:
        zi = s.get("zone_index", 0)
        ri = s.get("row_index", 0)
        ci = s.get("col_index", 0)
        stype = s.get("type", "")

        if zi >= len(zones):
            continue
        zone_cells = zones[zi].get("cells", [])

        # Find the target cell (first cell matching row and column).
        target_cell = next(
            (
                cell for cell in zone_cells
                if cell.get("row_index") == ri and cell.get("col_index") == ci
            ),
            None,
        )
        if target_cell is None:
            continue

        old_text = target_cell.get("text", "")
        original_word = s.get("original_text", "")
        suggested = s.get("suggested_text", "")

        # Decide what the original word is replaced with.
        if stype == "spell_fix":
            if not original_word or not suggested:
                continue
            replacement = suggested
        elif stype == "hyphen_join":
            display_parts = s.get("display_parts", [])
            if not original_word or not suggested or not display_parts:
                continue
            # The first display part is what goes in the current row.
            replacement = display_parts[0]
        else:
            continue

        # Replace from the right: the gutter word is the last one in the cell.
        idx = old_text.rfind(original_word)
        if idx < 0:
            continue

        new_text = old_text[:idx] + replacement + old_text[idx + len(original_word):]
        target_cell["text"] = new_text

        change: Dict[str, Any] = {
            "type": stype,
            "zone_index": zi,
            "row_index": ri,
            "col_index": ci,
            "cell_id": target_cell.get("cell_id", ""),
            "old_text": old_text,
            "new_text": new_text,
        }
        if stype == "hyphen_join":
            change["joined_word"] = suggested
        changes.append(change)

    logger.info("Gutter repair applied: %d/%d suggestions", len(changes), len(accepted_suggestions))

    return {
        # Count only suggestions that really modified a cell, so the number
        # agrees with the "changes" list and the log line above.
        "applied_count": len(changes),
        "changes": changes,
    }
|
||||
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
Gutter Repair — barrel re-export.
|
||||
|
||||
All implementation split into:
|
||||
cv_gutter_repair_core — spellchecker setup, data types, single-word repair
|
||||
cv_gutter_repair_grid — grid analysis, suggestion application
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
# Core: spellchecker, data types, repair helpers
|
||||
from cv_gutter_repair_core import ( # noqa: F401
|
||||
_init_spellcheckers,
|
||||
_is_known,
|
||||
_spell_candidates,
|
||||
_MIN_WORD_LEN_SPELL,
|
||||
_MIN_WORD_LEN_HYPHEN,
|
||||
_GUTTER_EDGE_THRESHOLD,
|
||||
_STOPWORDS,
|
||||
_IPA_RE,
|
||||
_is_ipa_text,
|
||||
_word_is_at_gutter_edge,
|
||||
GutterSuggestion,
|
||||
_TRAILING_PUNCT_RE,
|
||||
_try_hyphen_join,
|
||||
_try_spell_fix,
|
||||
_edit_distance,
|
||||
)
|
||||
|
||||
# Grid: analysis and application
|
||||
from cv_gutter_repair_grid import ( # noqa: F401
|
||||
analyse_grid_for_gutter_repair,
|
||||
apply_gutter_suggestions,
|
||||
)
|
||||
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
OCR Image Enhancement — Improve scan quality before OCR.
|
||||
|
||||
Applies CLAHE contrast enhancement + bilateral filter denoising
|
||||
to degraded scans. Only runs when scan_quality.is_degraded is True.
|
||||
|
||||
Pattern adapted from handwriting_htr_api.py (lines 50-68) and
|
||||
cv_layout.py (lines 229-241).
|
||||
|
||||
All operations use OpenCV (Apache-2.0).
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def enhance_for_ocr(
    img_bgr: np.ndarray,
    is_degraded: bool = False,
    clip_limit: float = 3.0,
    tile_size: int = 8,
    denoise_d: int = 9,
    denoise_sigma_color: float = 75,
    denoise_sigma_space: float = 75,
    sharpen: bool = True,
) -> np.ndarray:
    """
    Improve a scan before it is handed to OCR.

    Good scans (``is_degraded`` false) only get a light CLAHE pass with a
    fixed clip limit of 2.0 on an 8x8 tile grid. Degraded scans go through
    the full pipeline: CLAHE with the caller's settings, a bilateral filter,
    and optionally an unsharp mask.

    Args:
        img_bgr: Input BGR image
        is_degraded: Whether the scan is degraded (from ScanQualityReport)
        clip_limit: CLAHE clip limit (higher = more contrast)
        tile_size: CLAHE tile grid size
        denoise_d: Bilateral filter diameter
        denoise_sigma_color: Bilateral filter sigma for color
        denoise_sigma_space: Bilateral filter sigma for space
        sharpen: Apply unsharp mask for blurry scans

    Returns:
        Enhanced BGR image
    """

    def _equalize_lightness(image, clip, grid):
        # CLAHE is applied to the L channel of LAB only, so the colour
        # channels pass through unchanged (keeps colour for RapidOCR).
        lightness, chroma_a, chroma_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))
        equalizer = cv2.createCLAHE(clipLimit=clip, tileGridSize=grid)
        merged = cv2.merge([equalizer.apply(lightness), chroma_a, chroma_b])
        return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

    if is_degraded:
        logger.info(
            f"enhance_for_ocr: full enhancement "
            f"(CLAHE clip={clip_limit}, denoise d={denoise_d}, sharpen={sharpen})"
        )

        # Step 1: contrast enhancement with the caller-supplied settings.
        improved = _equalize_lightness(img_bgr, clip_limit, (tile_size, tile_size))

        # Step 2: bilateral filter removes noise while keeping edges.
        improved = cv2.bilateralFilter(
            improved,
            d=denoise_d,
            sigmaColor=denoise_sigma_color,
            sigmaSpace=denoise_sigma_space,
        )

        # Step 3: unsharp mask (image * 1.5 - blurred * 0.5) for blurry text.
        if sharpen:
            blurred = cv2.GaussianBlur(improved, (0, 0), 3)
            improved = cv2.addWeighted(improved, 1.5, blurred, -0.5, 0)

        logger.info("enhance_for_ocr: full enhancement pipeline complete")
        return improved

    # Good scan: a light CLAHE pass is enough and preserves quality.
    lightly_enhanced = _equalize_lightness(img_bgr, 2.0, (8, 8))
    logger.info("enhance_for_ocr: light CLAHE applied (good scan)")
    return lightly_enhanced
|
||||
@@ -0,0 +1,135 @@
|
||||
"""German IPA insertion for grid editor cells.
|
||||
|
||||
Hybrid approach:
|
||||
1. Primary lookup: wiki-pronunciation-dict (636k entries, CC-BY-SA)
|
||||
2. Fallback: epitran rule-based G2P (MIT license)
|
||||
|
||||
German IPA data sourced from Wiktionary contributors (CC-BY-SA 4.0).
|
||||
Attribution required — see grid editor UI.
|
||||
|
||||
Lizenz: Code Apache-2.0, IPA-Daten CC-BY-SA 4.0 (Wiktionary)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, List, Optional, Set
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# IPA/phonetic characters — skip cells that already contain IPA
|
||||
_IPA_RE = re.compile(r'[\[\]ˈˌːʃʒθðŋɑɒæɔəɛɜɪʊʌ]')
|
||||
|
||||
|
||||
def _lookup_ipa_de(word: str) -> Optional[str]:
    """Look up German IPA for a single word.

    The lower-cased, stripped word is looked up in the pronunciation
    dictionary first. If there is no entry and an epitran instance is
    loaded, the word is transliterated rule-based instead; a transliteration
    that merely echoes the lower-cased input is treated as "not found".

    Args:
        word: The word to transcribe.

    Returns:
        IPA string, or None if neither source yields a transcription (this
        includes the case where epitran raises; the failure is logged at
        debug level and the lookup stays best-effort).
    """
    from cv_vocab_types import _de_ipa_dict, _epitran_de, DE_IPA_AVAILABLE

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        return None

    lower = word.lower().strip()
    if not lower:
        return None

    # 1. Dictionary lookup (636k entries)
    ipa = _de_ipa_dict.get(lower)
    if ipa:
        return ipa

    # 2. epitran fallback (rule-based)
    if _epitran_de is not None:
        try:
            result = _epitran_de.transliterate(word)
        except Exception:
            # Best effort: a failing fallback must not break IPA insertion,
            # but leave a trace instead of swallowing the error silently.
            logger.debug("epitran transliteration failed for %r", word, exc_info=True)
            return None
        if result and result != word.lower():
            return result

    return None
|
||||
|
||||
|
||||
def _insert_ipa_for_text(text: str) -> str:
|
||||
"""Insert German IPA after each recognized word in a text string.
|
||||
|
||||
Handles comma-separated lists:
|
||||
"bildschön, blendend" → "bildschön [bɪltʃøn], blendend [blɛndənt]"
|
||||
|
||||
Skips cells already containing IPA brackets.
|
||||
"""
|
||||
if not text or _IPA_RE.search(text):
|
||||
return text
|
||||
|
||||
# Split on comma/semicolon sequences, keeping separators
|
||||
tokens = re.split(r'([,;:]+\s*)', text)
|
||||
result = []
|
||||
changed = False
|
||||
|
||||
for tok in tokens:
|
||||
# Keep separators as-is
|
||||
if not tok or re.match(r'^[,;:\s]+$', tok):
|
||||
result.append(tok)
|
||||
continue
|
||||
|
||||
# Process words within this token
|
||||
words = tok.split()
|
||||
new_words = []
|
||||
for w in words:
|
||||
# Strip punctuation for lookup
|
||||
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', w)
|
||||
if len(clean) < 3:
|
||||
new_words.append(w)
|
||||
continue
|
||||
|
||||
ipa = _lookup_ipa_de(clean)
|
||||
if ipa:
|
||||
new_words.append(f"{w} [{ipa}]")
|
||||
changed = True
|
||||
else:
|
||||
new_words.append(w)
|
||||
|
||||
result.append(' '.join(new_words))
|
||||
|
||||
return ''.join(result) if changed else text
|
||||
|
||||
|
||||
def insert_german_ipa(
    cells: List[Dict],
    target_cols: Set[str],
) -> int:
    """Add German IPA transcriptions to the text of cells in selected columns.

    A cell is processed when its ``col_type`` is in ``target_cols`` and its
    text is not blank. Cells whose text changes are updated in place and
    flagged with ``_ipa_corrected = True``.

    Args:
        cells: Flat list of all cells (modified in-place).
        target_cols: Set of col_type values to process.

    Returns:
        Number of cells modified (0 when no IPA source is available).
    """
    from cv_vocab_types import DE_IPA_AVAILABLE, _epitran_de

    if not DE_IPA_AVAILABLE and _epitran_de is None:
        logger.warning("German IPA not available — skipping")
        return 0

    modified = 0
    for cell in cells:
        if cell.get("col_type", "") not in target_cols:
            continue

        original = cell.get("text", "")
        if not original.strip():
            continue

        updated = _insert_ipa_for_text(original)
        if updated == original:
            continue

        cell["text"] = updated
        cell["_ipa_corrected"] = True
        modified += 1

    if modified:
        logger.info(f"German IPA inserted in {modified} cells")
    return modified
|
||||
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Legacy layout analysis using projection profiles.
|
||||
|
||||
Extracted from cv_layout_columns.py — contains:
|
||||
- analyze_layout() (projection-profile based column/header/footer detection)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion
|
||||
from cv_layout_detection import _find_content_bounds
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area.

    Outline:
      1. Invert ``ocr_img`` and ask ``_find_content_bounds`` for the content
         rectangle (falls back to the full image if it is implausibly small).
      2. Build a smoothed, normalised vertical projection of the content.
      3. Find "valleys" (low-ink x ranges) below a threshold; if fewer than
         two survive the edge filter, fall back to a 20-segment local-minima
         search for a pair of separators.
      4. Emit 3, 2 or 1 column regions depending on how many separators
         were found, then let ``_add_header_footer`` extend the region list.

    Args:
        layout_img: CLAHE-enhanced grayscale image. NOTE(review): not
            referenced anywhere in this function body.
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions.
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong: content covering less than 30%
        # of either dimension is distrusted and the whole image is used.
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    # Normalise by (rows * 255); presumably yields an ink ratio in 0..1 for
    # an 8-bit image — TODO confirm the dtype of ocr_img.
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile (odd-sized box filter, at least 5 wide)
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Find valleys using multiple threshold strategies
    # Strategy 1: relative to median (catches clear separators)
    # Strategy 2: local minima approach (catches subtle gaps)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions.
    # Each entry is (start, end, center, width, depth) in content-relative x.
    # A valley that is still open at the right edge of the profile is never
    # closed by this loop and therefore not recorded.
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges);
    # the valley center has to lie more than 8% away from either side.
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If fewer than two valleys were found with the strict threshold, try the
    # local minima approach. It only ever replaces `valleys` with a pair.
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments, find the 2 lowest
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Find two lowest non-adjacent segments that create reasonable columns
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges (first two / last two segments)
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Pick the best pair: non-adjacent, creating reasonable column widths
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = None
            best_score = float('inf')
            for i in range(len(candidate_valleys)):
                for j in range(i + 1, len(candidate_valleys)):
                    c1 = candidate_valleys[i][2]
                    c2 = candidate_valleys[j][2]
                    # Must be at least 20% apart
                    if (c2 - c1) < content_w * 0.2:
                        continue
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    # Each column at least 12% of content width
                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
                        continue
                    # Score: spread between widest and narrowest column
                    # (lower = more even distribution).
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    if score < best_score:
                        best_score = score
                        best_pair = (candidate_valleys[i], candidate_valleys[j])

            if best_pair:
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        if len(valleys) == 2:
            sep1_center = valleys[0][2]
            sep2_center = valleys[1][2]
        else:
            # Pick the two valleys that best divide into 3 parts
            # Prefer wider valleys (more likely true separators)
            best_pair = None
            best_score = float('inf')
            for i in range(len(valleys)):
                for j in range(i + 1, len(valleys)):
                    c1, c2 = valleys[i][2], valleys[j][2]
                    # Each column should be at least 15% of content width
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
                        continue
                    # Score: lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    # Bonus for wider valleys (subtract valley width)
                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
                    if score < best_score:
                        best_score = score
                        best_pair = (c1, c2)
            if best_pair:
                sep1_center, sep2_center = best_pair
            else:
                # No pair met the width constraint: use the two leftmost valleys.
                sep1_center = valleys[0][2]
                sep2_center = valleys[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Note: the outer columns extend to the image edges (x=0 and x=w),
        # not to the detected content bounds.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    # Lazy import to avoid circular dependency with cv_layout.py
    # NOTE(review): _add_header_footer presumably appends header/footer or
    # margin_top/margin_bottom regions to `regions` in place (the summary
    # below looks for exactly those types) — confirm in cv_layout_detection.
    from cv_layout_detection import _add_header_footer
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    # Summary log: first top/bottom region type found, or 'none'.
    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
|
||||
@@ -0,0 +1,494 @@
|
||||
"""
|
||||
Column type classification for OCR layout analysis.
|
||||
|
||||
Entry point: classify_column_types() with 4-level fallback chain.
|
||||
Also provides positional_column_regions() and _build_margin_regions().
|
||||
Position-based classifiers (Level 2+3) in cv_layout_classify_position.py.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import ColumnGeometry, PageRegion
|
||||
|
||||
from cv_layout_scoring import (
|
||||
_score_language,
|
||||
_score_role,
|
||||
_score_dictionary_signals,
|
||||
_classify_dictionary_columns,
|
||||
)
|
||||
|
||||
from cv_layout_classify_position import (
|
||||
_classify_by_position_enhanced,
|
||||
_classify_by_position_fallback,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Margin Region Building
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
    right_x: int,
    img_w: int,
    top_y: int,
    content_h: int,
) -> List[PageRegion]:
    """Derive margin_left / margin_right PageRegions from the content bounds.

    The left margin spans from the image edge to ``left_x``. The right margin
    starts where the right-most non-margin, non-header/footer region in
    ``all_regions`` ends (or at ``right_x`` when there is no such region) and
    runs to ``img_w``. A margin is only created when it is wider than 5 px.

    Margins are used downstream for faithful page reconstruction but are
    skipped during OCR.
    """
    min_gap_px = 5
    excluded_types = ('margin_left', 'margin_right', 'header', 'footer',
                      'margin_top', 'margin_bottom')

    def _margin(kind, x_pos, width):
        # All margins share vertical extent, confidence and method.
        return PageRegion(
            type=kind, x=x_pos, y=top_y,
            width=width, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        )

    found: List[PageRegion] = []

    if left_x > min_gap_px:
        found.append(_margin('margin_left', 0, left_x))

    # Right margin: from end of last content column to image edge
    column_ends = [r.x + r.width for r in all_regions if r.type not in excluded_types]
    content_end = max(column_ends) if column_ends else right_x
    right_gap = img_w - content_end
    if right_gap > min_gap_px:
        found.append(_margin('margin_right', content_end, right_gap))

    if found:
        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in found]} "
                    f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")

    return found
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Positional Column Regions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def positional_column_regions(
    geometries: List[ColumnGeometry],
    content_w: int,
    content_h: int,
    left_x: int,
) -> List[PageRegion]:
    """Label columns purely by geometry and position (no language scoring).

    Narrow or near-empty columns become structural regions (``page_ref`` or
    ``column_marker``). The remaining content columns are sorted left->right
    and named column_en, column_de, column_example; any further columns are
    also named column_example. These names are positional only. A single
    content column is returned as ``column_text``.
    """

    def _region(kind, geom, confidence):
        # Every region takes x/y/width from the geometry but the full content height.
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='positional',
        )

    def _structural_kind(geom):
        # Returns (type, confidence) for structural columns, else None.
        offset_ratio = (geom.x - left_x) / content_w if content_w else 0
        # page_ref: narrow column in the leftmost 20% region
        if geom.width_ratio < 0.12 and offset_ratio < 0.20:
            return 'page_ref', 0.95
        # column_marker: very narrow, few words
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            return 'column_marker', 0.95
        # empty or near-empty narrow column -> treat as margin/structural
        if geom.word_count <= 2 and geom.width_ratio < 0.15:
            return 'column_marker', 0.85
        return None

    structural: List[PageRegion] = []
    content_cols: List[ColumnGeometry] = []
    for geom in geometries:
        verdict = _structural_kind(geom)
        if verdict is None:
            content_cols.append(geom)
        else:
            kind, confidence = verdict
            structural.append(_region(kind, geom, confidence))

    # No content columns
    if not content_cols:
        return structural

    # Single content column -> plain text page
    if len(content_cols) == 1:
        return structural + [_region('column_text', content_cols[0], 0.9)]

    # Sort content columns left->right and assign positional labels
    content_cols.sort(key=lambda col: col.x)

    # With exactly 2 content columns: if the left one is very wide (>35%),
    # it likely contains EN+DE combined, so the right one is examples.
    wide_left_pair = (
        len(content_cols) == 2
        and content_cols[0].width_ratio > 0.35
        and content_cols[1].width_ratio > 0.20
    )
    if wide_left_pair:
        labels = ['column_en', 'column_example']
    else:
        labels = ['column_en', 'column_de', 'column_example']

    regions = list(structural)
    for position, geom in enumerate(content_cols):
        # Columns beyond the label list fall back to column_example.
        label = labels[position] if position < len(labels) else 'column_example'
        regions.append(_region(label, geom, 0.95))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content -> "
                f"{[r.type for r in regions]}")
    return regions
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main Classification Entry Point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None,
                          document_category: Optional[str] = None,
                          margin_strip_detected: bool = False) -> List[PageRegion]:
    """Assign a type to every column via a four-stage fallback chain.

    The stages are tried in order and the first one that yields a result wins:

        Level 0: Dictionary detection (only if the dictionary signals say so)
        Level 1: Content-based (language + role scoring)
        Level 2: Position rules confirmed by language scores
        Level 3: Pure position rules (always produces a result)

    Before the chain runs, a first/last column with fewer than 8 words that
    is not a sub-column is emitted as ``column_ignore``. The surviving
    geometries are re-indexed in place (``g.index`` is overwritten).

    Args:
        geometries: Column geometries to classify.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound, forwarded to ``_build_margin_regions``.
        right_x: Right content bound, forwarded to ``_build_margin_regions``.
        inv: Forwarded unchanged to ``_add_header_footer``.
        document_category: Forwarded to ``_score_dictionary_signals``
            (e.g. 'woerterbuch').
        margin_strip_detected: Forwarded to ``_score_dictionary_signals``.

    Returns:
        List of PageRegion: ignored edge columns, classified columns and the
        margin regions produced by ``_build_margin_regions``.
    """
    # Imported at function scope: a module-level import would be circular.
    from cv_layout_detection import _add_header_footer  # noqa: E402

    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Return *result* followed by the margin regions built for it."""
        return result + _build_margin_regions(
            result, left_x, right_x, img_w, top_y, content_h)

    def _plain_text_region(geom: ColumnGeometry) -> PageRegion:
        """Build the ``column_text`` region used when one column is left."""
        return PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        )

    # A lone column is treated as a plain text page.
    if len(geometries) == 1:
        return _with_margins([_plain_text_region(geometries[0])])

    # --- Pre-filter: sparse first/last columns become column_ignore ---
    # Sub-columns are exempt; they are expected to hold only a few words.
    last_pos = len(geometries) - 1
    ignore_regions: List[PageRegion] = []
    active_geometries: List[ColumnGeometry] = []
    for idx, g in enumerate(geometries):
        is_edge = idx in (0, last_pos)
        if not (is_edge and g.word_count < 8 and not g.is_sub_column):
            active_geometries.append(g)
            continue
        ignore_regions.append(PageRegion(
            type='column_ignore', x=g.x, y=g.y,
            width=g.width, height=content_h,
            classification_confidence=0.95,
            classification_method='content',
        ))
        logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) -> column_ignore (edge, few words)")

    # Re-number the survivors so their indices are contiguous again.
    for new_idx, g in enumerate(active_geometries):
        g.index = new_idx
    geometries = active_geometries

    # Nothing (or just one column) left after filtering.
    if not geometries:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        ignore_regions.append(_plain_text_region(geometries[0]))
        return _with_margins(ignore_regions)

    # --- Score all columns ---
    lang_scores = [_score_language(g.words) for g in geometries]
    role_scores = [_score_role(g) for g in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    def _finish(regions: List[PageRegion]) -> List[PageRegion]:
        """Run the header/footer step on *regions*, then prepend ignored
        columns and append margins."""
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 0: Dictionary detection ---
    dict_signals = _score_dictionary_signals(
        geometries,
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
    )
    if dict_signals["is_dictionary"]:
        regions = _classify_dictionary_columns(
            geometries, dict_signals, lang_scores, content_h,
        )
        if regions is not None:
            logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
                        dict_signals["confidence"])
            return _finish(regions)

    # --- Level 1: Content-based classification ---
    regions = _classify_by_content(
        geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        return _finish(regions)

    # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(
        geometries, lang_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        return _finish(regions)

    # --- Level 3: Pure position fallback ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    regions = _classify_by_position_fallback(geometries, content_w, content_h)
    return _finish(regions)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Level 1: Content-Based Classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: type columns from their content scores.

    Structural columns (page references, markers) are picked first from the
    role scores and width ratios. Among the rest an EN and a DE column are
    chosen by language score; all other columns become ``column_example``.

    Returns:
        Regions sorted by ``x``, or ``None`` when no EN/DE pair can be
        determined from the language scores.
    """
    def _make(kind: str, geom: ColumnGeometry, confidence: float) -> PageRegion:
        """Build a content-classified region spanning the content height."""
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='content',
        )

    regions = []
    assigned = set()

    # Step 1: structural roles (reference, marker).
    # Only columns starting within the leftmost 20% of the content width
    # (measured from the first column) may become page_ref.
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for pos, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            kind, confidence = 'page_ref', rs['reference']
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            kind, confidence = 'column_marker', rs['marker']
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column away from the left edge -> marker, not page_ref
            kind, confidence = 'column_marker', 0.8
        else:
            continue
        regions.append(_make(kind, geom, confidence))
        assigned.add(pos)

    # Step 2: look for the EN and DE columns among what is left.
    remaining = [
        (i, geometries[i], lang_scores[i], role_scores[i])
        for i in range(len(geometries))
        if i not in assigned
    ]

    if len(remaining) < 2:
        # Too few columns for an EN/DE pair; a single leftover is plain text.
        if remaining:
            _, geom, _, _ = remaining[0]
            regions.append(_make('column_text', geom, 0.6))
        regions.sort(key=lambda r: r.x)
        return regions

    en_candidates = [(i, g, ls) for i, g, ls, _ in remaining
                     if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, _ in remaining
                     if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    if not en_candidates or not de_candidates:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak -- fall back to positional: left=EN, right=DE
            by_x = sorted(remaining, key=lambda item: item[1].x)
            en_i, en_geom = by_x[0][0], by_x[0][1]
            de_i, de_geom = by_x[1][0], by_x[1][1]
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")

            regions.append(_make('column_en', en_geom, 0.4))
            assigned.add(en_i)
            regions.append(_make('column_de', de_geom, 0.4))
            assigned.add(de_i)

            # Everything else is treated as an example column.
            for i, geom, _, _ in remaining:
                if i not in assigned:
                    regions.append(_make('column_example', geom, 0.4))
            regions.sort(key=lambda r: r.x)
            return regions

        # One side is missing and the signals are not weak enough for the
        # positional tiebreaker.
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Strongest candidate per language.
    best_en = max(en_candidates, key=lambda c: c[2]['eng'])
    best_de = max(de_candidates, key=lambda c: c[2]['deu'])

    # Position-aware EN selection: the typical layout is EN | DE | Example.
    # Example sentences contain English function words that inflate the eng
    # score of the Example column. If the best EN candidate lies right of a
    # confident DE column and another EN candidate lies left of it, take the
    # left one.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda c: c[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix -- best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both -- ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    regions.append(_make('column_en', best_en[1], round(best_en[2]['eng'], 2)))
    assigned.add(best_en[0])
    regions.append(_make('column_de', best_de[1], round(best_de[2]['deu'], 2)))
    assigned.add(best_de[0])

    # Step 3: every leftover column becomes an example column; only the
    # confidence depends on the sentence score.
    for i, geom, _, rs in remaining:
        if i in assigned:
            continue
        sentence = rs['sentence']
        confidence = round(sentence, 2) if sentence > 0.4 else 0.5
        regions.append(_make('column_example', geom, confidence))

    regions.sort(key=lambda r: r.x)
    return regions
|
||||
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
Position-based column type classification for OCR layout analysis.
|
||||
|
||||
Contains Level 2 and Level 3 classification functions:
|
||||
Level 2 – _classify_by_position_enhanced: Position + language confirmation
|
||||
Level 3 – _classify_by_position_fallback: Pure positional (no regression)
|
||||
|
||||
Extracted from cv_layout_classify.py during file-size split.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import ColumnGeometry, PageRegion
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Level 2: Position-Enhanced Classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Applies the positional heuristics (page_ref, marker, example, EN/DE) in
    order and swaps the EN/DE assignment when the language scores of the two
    columns point the other way.

    Args:
        geometries: Columns to classify, in left-to-right order.
        lang_scores: One ``{'eng': ..., 'deu': ...}`` dict per geometry.
        content_w: Total content width.
        content_h: Content height; used as the height of every region.

    Returns:
        Regions sorted by ``x``. An empty list is returned for empty input.
    """
    # Without this guard the unconditional geometries[0] access below
    # raises IndexError on an empty list.
    if not geometries:
        return []

    def _region(kind: str, geom: ColumnGeometry, confidence: float) -> PageRegion:
        """Build a region of the given type spanning the content height."""
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method='position_enhanced',
        )

    regions = []
    untyped = list(range(len(geometries)))

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
    g0 = geometries[0]
    left_20_threshold = g0.x + content_w * 0.20
    ls0 = lang_scores[0]
    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
        regions.append(_region('page_ref', g0, 0.8))
        untyped.remove(0)

    # Rule 2: Narrow columns with few words -> marker
    # Iterate over a copy because untyped is modified inside the loop.
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(_region('column_marker', geom, 0.7))
            untyped.remove(i)

    # Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped.pop()
        regions.append(_region('column_example', geometries[last_idx], 0.7))

    # Rule 4: First two remaining -> EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a, idx_b = untyped[0], untyped[1]
        ls_a = lang_scores[idx_a]
        ls_b = lang_scores[idx_b]

        # Default: first=EN, second=DE
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7

        # Swap if language signals clearly indicate the opposite
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(_region('column_en', geometries[en_idx], conf))
        regions.append(_region('column_de', geometries[de_idx], conf))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        regions.append(_region('column_en', geometries[untyped[0]], 0.5))
        untyped = []

    # Remaining -> example
    for idx in untyped:
        regions.append(_region('column_example', geometries[idx], 0.5))

    regions.sort(key=lambda r: r.x)
    return regions
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Level 3: Position Fallback Classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback.

    Applies the positional rules (page_ref, marker, example, EN/DE) without
    consulting any language scores. Every region gets confidence 1.0 and
    method ``'position_fallback'``.

    Args:
        geometries: Columns to classify, in left-to-right order.
        content_w: Total content width.
        content_h: Content height; used as the height of every region.

    Returns:
        Regions sorted by ``x``. An empty list is returned for empty input.
    """
    # Without this guard the unconditional geometries[0] access below
    # raises IndexError on an empty list.
    if not geometries:
        return []

    def _region(kind: str, geom: ColumnGeometry) -> PageRegion:
        """Build a region of the given type spanning the content height."""
        return PageRegion(
            type=kind, x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        )

    regions = []
    untyped = list(range(len(geometries)))

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
    g0 = geometries[0]
    left_20_threshold = g0.x + content_w * 0.20
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
        regions.append(_region('page_ref', g0))
        untyped.remove(0)

    # Rule 2: Narrow + few words -> marker
    # Iterate over a copy because untyped is modified inside the loop.
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(_region('column_marker', geom))
            untyped.remove(i)

    # Rule 3: Rightmost remaining -> example (if 3+)
    if len(untyped) >= 3:
        last_idx = untyped.pop()
        regions.append(_region('column_example', geometries[last_idx]))

    # Rule 4: First remaining -> EN, second -> DE
    if len(untyped) >= 2:
        regions.append(_region('column_en', geometries[untyped[0]]))
        regions.append(_region('column_de', geometries[untyped[1]]))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        regions.append(_region('column_en', geometries[untyped[0]]))
        untyped = []

    # Anything still untyped is an example column.
    for idx in untyped:
        regions.append(_region('column_example', geometries[idx]))

    regions.sort(key=lambda r: r.x)
    return regions
|
||||
@@ -0,0 +1,458 @@
|
||||
"""
|
||||
Post-processing refinements for column geometry.
|
||||
|
||||
Extracted from cv_layout_columns.py — contains:
|
||||
- _detect_sub_columns() (sub-column detection via left-edge alignment)
|
||||
- _split_broad_columns() (broad column splitting via word-coverage gaps)
|
||||
- expand_narrow_columns() (narrow column expansion into whitespace)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import statistics
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _detect_sub_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    top_y: int = 0,
    header_y: Optional[int] = None,
    footer_y: Optional[int] = None,
    _edge_tolerance: int = 8,
    _min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
    """Split columns whose words reveal a separate sub-column on the left.

    For every sufficiently wide column the left edges of its confident words
    (``conf >= 30``) are clustered into bins whose neighbouring edges lie at
    most ``_edge_tolerance`` px apart. The leftmost bin holding at least
    ``max(3, _min_col_start_ratio * confident_words)`` words is taken as the
    real column start. Words further left form a sub-column when there are
    at least 2 of them in the body, they make up less than 35 % of the
    column's words, and the horizontal gap to the column start is wide
    enough not to be an inline marker.

    Word ``left`` values are relative to the content ROI, while
    ``ColumnGeometry.x`` is absolute; *left_x* is added to convert the split
    position. Word ``top`` values are relative to *top_y*; when *header_y* /
    *footer_y* (absolute) are given, words above/below them are excluded
    from clustering and from the sub-column word count.

    Returns:
        The input list unchanged when ``content_w <= 0``; otherwise a new
        list (possibly longer than the input), sorted by ``x`` with
        ``index`` reassigned left to right.
    """
    if content_w <= 0:
        return geometries

    # Header/footer limits in the same (top_y-relative) frame as word 'top'.
    min_top_rel = None if header_y is None else header_y - top_y
    max_top_rel = None if footer_y is None else footer_y - top_y

    def _in_body(word: Dict) -> bool:
        """True when *word* lies between the header and footer limits."""
        return ((min_top_rel is None or word['top'] >= min_top_rel)
                and (max_top_rel is None or word['top'] <= max_top_rel))

    result: List[ColumnGeometry] = []
    for geo in geometries:
        # Only consider wide-enough columns with enough words
        if geo.width_ratio < 0.15 or geo.word_count < 5:
            result.append(geo)
            continue

        confident = [w for w in geo.words
                     if w.get('conf', 0) >= 30 and _in_body(w)]
        if len(confident) < 3:
            result.append(geo)
            continue

        # --- Cluster left-edges into alignment bins ---
        sorted_edges = sorted(w['left'] for w in confident)
        clusters: List[List[int]] = [[sorted_edges[0]]]
        for edge in sorted_edges[1:]:
            if edge - clusters[-1][-1] <= _edge_tolerance:
                clusters[-1].append(edge)
            else:
                clusters.append([edge])
        # Each bin: (center, count, min_edge, max_edge)
        bins: List[Tuple[int, int, int, int]] = [
            (sum(c) // len(c), len(c), min(c), max(c)) for c in clusters
        ]

        # --- Find the leftmost bin qualifying as a real column start ---
        min_count = max(3, int(len(confident) * _min_col_start_ratio))
        col_start_bin = next((b for b in bins if b[1] >= min_count), None)
        if col_start_bin is None:
            result.append(geo)
            continue

        # Words left of the column-start bin are sub-column candidates
        split_threshold = col_start_bin[2] - _edge_tolerance
        sub_words = [w for w in geo.words if w['left'] < split_threshold]
        main_words = [w for w in geo.words if w['left'] >= split_threshold]

        # Only body words count towards the thresholds, so header/footer
        # words cannot trigger a split on their own.
        sub_body = [w for w in sub_words if _in_body(w)]
        if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
            result.append(geo)
            continue

        # --- Guard against inline markers (bullet points, numbering) ---
        # Require a gap of at least 1.2x the median word height (and at
        # least 20px) between the rightmost sub-word and the column start.
        max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
        gap_to_main = col_start_bin[2] - max_sub_right  # px gap
        med_h = statistics.median([w.get('height', 20) for w in confident])
        min_gap = max(med_h * 1.2, 20)
        if gap_to_main < min_gap:
            logger.debug(
                "SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
                "(likely inline markers, not a sub-column)",
                geo.index, gap_to_main, min_gap)
            result.append(geo)
            continue

        # --- Build two sub-column geometries ---
        # The split sits halfway between the rightmost sub-word left edge
        # and the column start, converted from ROI-relative to absolute.
        max_sub_left = max(w['left'] for w in sub_words)
        split_rel = (max_sub_left + col_start_bin[2]) // 2
        split_abs = split_rel + left_x

        sub_width = split_abs - geo.x
        main_width = (geo.x + geo.width) - split_abs
        if sub_width <= 0 or main_width <= 0:
            result.append(geo)
            continue

        # content_w > 0 is guaranteed by the guard at the top.
        sub_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=sub_width,
            height=geo.height,
            word_count=len(sub_words),
            words=sub_words,
            width_ratio=sub_width / content_w,
            is_sub_column=True,
        )
        main_geo = ColumnGeometry(
            index=0,
            x=split_abs,
            y=geo.y,
            width=main_width,
            height=geo.height,
            word_count=len(main_words),
            words=main_words,
            width_ratio=main_width / content_w,
            is_sub_column=True,
        )
        result.extend((sub_geo, main_geo))

        logger.info(
            f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
            f"(rel={split_rel}), sub={len(sub_words)} words, "
            f"main={len(main_words)} words, "
            f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
        )

    # Re-index by left-to-right order
    result.sort(key=lambda g: g.x)
    for position, column in enumerate(result):
        column.index = position

    return result
|
||||
|
||||
|
||||
def _split_broad_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int = 0,
    _broad_threshold: float = 0.35,
    _min_gap_px: int = 15,
    _min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
    """Split overly broad columns at their widest internal word-coverage gap.

    For each column wider than ``_broad_threshold`` (as ``width_ratio``) with
    at least 10 words, a per-pixel coverage array is built from the word
    boxes, smoothed with a 3px box kernel and normalised to its maximum.
    Runs where coverage stays below 0.5 are gaps; the widest gap that keeps
    more than 10px distance from both column edges is the split candidate.

    Args:
        geometries: Column geometries to inspect.
        content_w: Width of the content area in pixels.
        left_x: Left edge of content ROI in absolute image coordinates
            (word ``left`` values are relative to it).
        _broad_threshold: A column must exceed this width_ratio to be split.
        _min_gap_px: Minimum gap width (pixels) to trigger a split.
        _min_words_per_split: Both halves must have at least this many words.

    Returns:
        New list of ColumnGeometry (possibly with more columns), sorted by
        ``x`` with ``index`` reassigned left to right.
    """
    result: List[ColumnGeometry] = []

    logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
                f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")

    _edge_margin = 10  # gaps this close to a column edge count as margins
    box_kernel = np.ones(3, dtype=np.float32) / 3.0

    for geo in geometries:
        if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
            result.append(geo)
            continue

        # Build word-coverage array (one slot per pixel column)
        col_left_rel = geo.x - left_x  # column left in content-relative coords
        coverage = np.zeros(geo.width, dtype=np.float32)
        for wd in geo.words:
            # wd['left'] is relative to left_x (content ROI)
            start = wd['left'] - col_left_rel
            stop = start + wd.get('width', 0)
            start = max(0, int(start))
            stop = min(geo.width, int(stop))
            if stop > start:
                coverage[start:stop] += 1.0

        # Light smoothing (kernel=3px) to avoid noise
        if len(coverage) > 3:
            coverage = np.convolve(coverage, box_kernel, mode='same')

        # Normalise to [0, 1]
        cmax = coverage.max()
        if cmax > 0:
            coverage /= cmax

        # Extract runs of low coverage as (start, end_exclusive, length).
        # Padding with False on both sides makes every run produce exactly
        # one rising and one falling edge in the difference signal.
        low_mask = coverage < 0.5
        padded = np.concatenate(([False], low_mask, [False])).astype(np.int8)
        steps = np.diff(padded)
        run_starts = np.flatnonzero(steps == 1)
        run_ends = np.flatnonzero(steps == -1)
        all_gaps = [(int(s), int(e), int(e - s))
                    for s, e in zip(run_starts, run_ends)]

        # Filter: only internal gaps (not touching column edges)
        internal_gaps = [g for g in all_gaps
                         if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
        best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None

        logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
                    f"{[g for g in all_gaps if g[2] >= 5]}, "
                    f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
                    f"best={best_gap}")

        if best_gap is None or best_gap[2] < _min_gap_px:
            result.append(geo)
            continue

        gap_center = (best_gap[0] + best_gap[1]) // 2

        # Assign each word to a side by its horizontal midpoint
        left_words = []
        right_words = []
        for wd in geo.words:
            mid = (wd['left'] - col_left_rel) + wd.get('width', 0) / 2.0
            (left_words if mid < gap_center else right_words).append(wd)

        if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
            result.append(geo)
            continue

        # Build two new ColumnGeometry objects
        left_w = gap_center
        right_w = geo.width - gap_center

        left_geo = ColumnGeometry(
            index=0,
            x=geo.x,
            y=geo.y,
            width=left_w,
            height=geo.height,
            word_count=len(left_words),
            words=left_words,
            width_ratio=left_w / content_w if content_w else 0,
            is_sub_column=True,
        )
        right_geo = ColumnGeometry(
            index=0,
            x=geo.x + gap_center,
            y=geo.y,
            width=right_w,
            height=geo.height,
            word_count=len(right_words),
            words=right_words,
            width_ratio=right_w / content_w if content_w else 0,
            is_sub_column=True,
        )

        logger.info(
            f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
            f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
            f"left={len(left_words)} words (w={left_w}), "
            f"right={len(right_words)} words (w={right_w})"
        )

        result.extend((left_geo, right_geo))

    # Re-index left-to-right
    result.sort(key=lambda g: g.x)
    for position, column in enumerate(result):
        column.index = position

    return result
|
||||
|
||||
|
||||
def expand_narrow_columns(
    geometries: List[ColumnGeometry],
    content_w: int,
    left_x: int,
    word_dicts: List[Dict],
) -> List[ColumnGeometry]:
    """Widen narrow columns into the unused space next to their neighbours.

    A column is "narrow" when it covers less than 10% of ``content_w``.
    According to the original author, such columns (marker, page_ref) often
    lose content at image edges due to residual shear.

    Each narrow column is grown, in list order, towards both neighbours:

    * to the left, up to ``_MIN_WORD_MARGIN`` px right of the rightmost word
      edge in the left neighbour (or of that neighbour's own left edge when
      it holds no words);
    * to the right, up to ``_MIN_WORD_MARGIN`` px left of the leftmost word
      in the right neighbour (or of that neighbour's right edge when it
      holds no words).

    After an expansion the column's words are re-collected from
    ``word_dicts`` and any neighbour that now overlaps is shrunk (never
    below width 0) and gets its words re-collected as well.

    The original docstring asks for this to be called AFTER
    ``_detect_sub_columns()`` so that sub-column splits have already happened.

    Args:
        geometries: Columns to adjust. Neighbours are taken from adjacent
            list positions, so the list should be ordered left to right.
            The objects are modified in place.
        content_w: Width of the content area in pixels. A value <= 0 makes
            every column count as 100% wide, i.e. nothing is expanded.
        left_x: Absolute x of the content area's left edge; word ``left``
            values are treated as relative to it.
        word_dicts: Word boxes used to re-assign words after a resize. Only
            the ``left`` key is read here; a word belongs to the column that
            contains its left edge.

    Returns:
        The same ``geometries`` list that was passed in.
    """
    _NARROW_THRESHOLD_PCT = 10.0
    _MIN_WORD_MARGIN = 4

    if len(geometries) < 2:
        return geometries

    def _pct(width):
        # A degenerate content width is reported as 100% so that no column
        # is classified as narrow (and no division by zero can occur).
        return width / content_w * 100 if content_w > 0 else 100

    def _ratio(width):
        return width / content_w if content_w > 0 else 0.0

    def _words_in(col):
        # Words whose left edge (relative to left_x) lies inside the column.
        lo = col.x - left_x
        hi = lo + col.width
        return [wd for wd in word_dicts if lo <= wd['left'] < hi]

    logger.info("ExpandNarrowCols: input %d cols: %s",
                len(geometries),
                [(i, g.x, g.width, round(_pct(g.width), 1))
                 for i, g in enumerate(geometries)])

    last_idx = len(geometries) - 1
    for i, g in enumerate(geometries):
        orig_pct = _pct(g.width)
        if orig_pct >= _NARROW_THRESHOLD_PCT:
            continue

        left_nb = geometries[i - 1] if i > 0 else None
        right_nb = geometries[i + 1] if i < last_idx else None
        expanded = False

        # --- try expanding to the LEFT ---
        if left_nb is not None:
            # The gap can be 0 when a sub-column split produced adjacent
            # columns, so measure from the neighbour's words, not its edge:
            # there may be unused space behind its rightmost word.
            nb_word_rights = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_word_rights:
                safe_left_abs = left_x + max(nb_word_rights) + _MIN_WORD_MARGIN
            else:
                # No words in the neighbour: claim space up to its start.
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                g.width += g.x - safe_left_abs
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if right_nb is not None:
            nb_word_lefts = [wd['left'] for wd in right_nb.words]
            if nb_word_lefts:
                safe_right_abs = left_x + min(nb_word_lefts) - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            if safe_right_abs > g.x + g.width:
                g.width = safe_right_abs - g.x
                expanded = True

        if not expanded:
            continue

        g.words = _words_in(g)
        g.word_count = len(g.words)
        g.width_ratio = _ratio(g.width)
        logger.info(
            "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
            i, orig_pct, _pct(g.width), g.x, g.width, g.word_count)

        # --- Shrink overlapping neighbours to match the new boundaries ---
        # Left neighbour: its right edge must not exceed our new left edge.
        if left_nb is not None and left_nb.x + left_nb.width > g.x:
            left_nb.width = max(0, g.x - left_nb.x)
            left_nb.width_ratio = _ratio(left_nb.width)
            left_nb.words = _words_in(left_nb)
            left_nb.word_count = len(left_nb.words)

        # Right neighbour: its left edge must not lie before our new right
        # edge; its own right edge stays where it was.
        if right_nb is not None:
            my_right = g.x + g.width
            if right_nb.x < my_right:
                old_right_edge = right_nb.x + right_nb.width
                right_nb.x = my_right
                right_nb.width = max(0, old_right_edge - my_right)
                right_nb.width_ratio = _ratio(right_nb.width)
                right_nb.words = _words_in(right_nb)
                right_nb.word_count = len(right_nb.words)

    return geometries
|
||||
@@ -0,0 +1,589 @@
|
||||
"""
|
||||
Core column detection: gap-based geometry and clustering fallback.
|
||||
|
||||
Extracted from the original cv_layout_columns.py — contains:
|
||||
- _detect_columns_by_clustering() (fallback clustering)
|
||||
- _build_geometries_from_starts() (geometry construction)
|
||||
- detect_column_geometry() (main column detection)
|
||||
|
||||
Post-processing (sub-columns, broad-column split, narrow expansion)
|
||||
lives in cv_layout_column_refine.py.
|
||||
Legacy projection-profile layout lives in cv_layout_analyze.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
from cv_layout_detection import _find_content_bounds
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
|
||||
# =============================================================================
|
||||
|
||||
# --- Phase A: Geometry Detection ---
|
||||
|
||||
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used by ``detect_column_geometry`` when the gap-based algorithm finds
    fewer than 2 gaps.

    Steps:
      1. Sort word left edges and chain them into clusters; an edge joins
         the current cluster when it is within ``tolerance`` px of the
         previous edge.
      2. Keep clusters with at least 2 edges that span enough of the content
         height: >= 30% ("primary"), or >= 15% with at least 5 words
         ("secondary").
      3. Merge kept clusters whose mean x lies closer than
         ``merge_distance`` to the running merged cluster.
      4. Hand the leftmost edge of each merged cluster (minus a small
         margin) to ``_build_geometries_from_starts``.

    Args:
        word_dicts: Word boxes; ``top`` is read via ``edge_word_indices``.
        left_edges: Left edge of each accepted word.
        edge_word_indices: For each entry of ``left_edges``, the index of
            the matching word in ``word_dicts``.
        content_w, content_h: Content area size in pixels.
        left_x, right_x, top_y, bottom_y: Content bounds; passed through to
            ``_build_geometries_from_starts``.
        inv: Passed through unchanged into the result tuple.

    Returns:
        The tuple produced by ``_build_geometries_from_starts``, or ``None``
        when there are no edges or fewer than 3 column candidates remain.
    """
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
    if not sorted_pairs:
        # Nothing to cluster; previously this crashed with an IndexError.
        logger.info("ColumnGeometry clustering fallback: no word edges to cluster")
        return None

    # --- 1. Chain sorted edges into clusters of (edge, word_index) pairs ---
    tolerance = max(10, int(content_w * 0.01))
    clusters = [[sorted_pairs[0]]]
    for pair in sorted_pairs[1:]:
        # Distance is measured to the previous edge, not the cluster mean,
        # so a cluster may drift further than `tolerance` overall.
        if pair[0] - clusters[-1][-1][0] <= tolerance:
            clusters[-1].append(pair)
        else:
            clusters.append([pair])

    # --- 2. Describe every cluster that has at least two edges ---
    cluster_infos = []
    for members in clusters:
        if len(members) < 2:
            continue
        c_edges = [edge for edge, _ in members]
        y_positions = [word_dicts[widx]['top'] for _, widx in members]
        y_span = max(y_positions) - min(y_positions)
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_span / content_h if content_h > 0 else 0.0,
        })

    primary = [c for c in cluster_infos
               if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    secondary = [c for c in cluster_infos
                 if MIN_Y_COVERAGE_SECONDARY <= c['y_coverage'] < MIN_Y_COVERAGE_PRIMARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # --- 3. Merge clusters that sit too close together ---
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        prev = merged[-1]
        if s['mean_x'] - prev['mean_x'] < merge_distance:
            total = prev['count'] + s['count']
            # Word-count weighted mean, kept as an int.
            prev['mean_x'] = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # --- 4. Column starts: leftmost edge of each cluster minus a margin ---
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )
|
||||
|
||||
|
||||
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
    inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Turn ``(abs_start_x, word_count)`` pairs into ColumnGeometry objects.

    Every column runs from its own start to the start of the next entry;
    the last column ends at ``right_x``. The ``word_count`` element of each
    pair is not used: words are re-collected from ``word_dicts`` by testing
    whether a word's ``left`` (relative to ``left_x``) falls inside the
    column.

    Returns:
        ``(geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)``
        with all values other than ``geometries`` passed through unchanged.
    """
    start_xs = [start for start, _count in col_starts]
    # Right edge of column k is the left edge of column k+1 (or right_x).
    end_xs = start_xs[1:] + [right_x]

    geometries = []
    for idx, (x_abs, x_end) in enumerate(zip(start_xs, end_xs)):
        width = x_end - x_abs
        rel_lo = x_abs - left_x
        rel_hi = rel_lo + width
        members = [wd for wd in word_dicts if rel_lo <= wd['left'] < rel_hi]
        ratio = width / content_w if content_w > 0 else 0.0
        geometries.append(ColumnGeometry(
            index=idx,
            x=x_abs,
            y=top_y,
            width=width,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=ratio,
        ))

    summary = [(g.index, g.x, g.width, g.word_count) for g in geometries]
    logger.info(f"ColumnGeometry: {len(geometries)} columns: {summary}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||
|
||||
|
||||
def _runs_of_true(mask: np.ndarray, min_len: int) -> List[Tuple[int, int]]:
    """Return ``(start, end)`` spans of consecutive truthy entries in *mask*.

    ``end`` is exclusive. Runs shorter than *min_len* are dropped; a run that
    reaches the end of *mask* is closed at ``len(mask)``.
    """
    runs: List[Tuple[int, int]] = []
    run_start: Optional[int] = None
    for idx, flag in enumerate(mask):
        if flag:
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            if idx - run_start >= min_len:
                runs.append((run_start, idx))
            run_start = None
    if run_start is not None and len(mask) - run_start >= min_len:
        runs.append((run_start, len(mask)))
    return runs


def _tesseract_conf_to_int(raw) -> int:
    """Convert one Tesseract ``conf`` cell to an int, or -1 if unparseable.

    Accepts ints, floats and numeric strings (including decimals such as
    ``"96.5"``); fractional confidences are truncated.
    """
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return -1


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box. If fewer than 2 pixel gaps
    survive, gaps are searched in the word-box coverage instead, and if
    that also yields fewer than 2 gaps the clustering fallback
    (``_detect_columns_by_clustering``) is used.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely (Tesseract error, fewer than 5
        usable words, or the clustering fallback giving up). ``inv`` is the
        bitwise-inverted ``ocr_img``. Word ``left``/``top`` values are
        relative to the crop origin ``(left_x, top_y)``.
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small bounds: fall back to the full image.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    word_dicts = []
    left_edges = []
    edge_word_indices = []
    for i in range(len(data['text'])):
        # Parse via float so decimal confidences are not mistaken for
        # "no confidence" and silently dropped.
        conf = _tesseract_conf_to_int(data['conf'][i])
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    H_MIN_GAP = max(5, content_h // 200)  # minimum gap height in rows
    h_gaps = _runs_of_true(h_proj_row_norm < H_GAP_THRESH, H_MIN_GAP)

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles). With fewer than 3
    # gaps there is no meaningful median, so every gap counts as large.
    if len(h_gaps) >= 3:
        gap_sizes = sorted(ge - gs for gs, ge in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest.
    # seg_boundaries alternates segment-top / segment-bottom values.
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1]
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    best_seg = None
    if segments:
        # max() returns the first of equally tall segments, matching a
        # stable descending sort.
        best_seg = max(segments, key=lambda s: s[2])
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)
    MIN_GAP_WIDTH = max(8, content_w // 200)  # at least 8px, or 0.5% of content width

    # (start_x_rel, end_x_rel) relative to the content ROI
    raw_gaps = _runs_of_true(v_smooth < gap_threshold, MIN_GAP_WIDTH)

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if len(segments) > 1:
        seg_top_rel, seg_bot_rel = best_seg[0], best_seg[1]  # relative to content strip
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_rel
                         and wd['top'] + wd['height'] <= seg_bot_rel]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Horizontal extents of all words that intersect this gap
        overlaps = [(wd['left'], wd['left'] + wd['width'])
                    for wd in segment_words
                    if wd['left'] < gap_end_rel
                    and wd['left'] + wd['width'] > gap_start_rel]
        if not overlaps:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue

        # Try to shrink the gap so it avoids the overlapping word(s)
        min_word_left = min(content_w, min(lo for lo, _ in overlaps))
        max_word_right = max(0, max(hi for _, hi in overlaps))

        if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
            # Enough room before the overlapping words
            validated_gaps.append((gap_start_rel, min_word_left))
            logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
        elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
            # Enough room after the overlapping words
            validated_gaps.append((max_word_right, gap_end_rel))
            logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
        else:
            logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. Graphics
    # that Tesseract does not report as words cannot disturb this.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            # Clip to the content strip: words may extend past right_x
            # because the OCR crop runs to the full image width.
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        WC_MIN_GAP = max(4, content_w // 300)
        # < 0.5 means effectively zero word coverage
        wc_gaps = _runs_of_true(wc_smooth < 0.5, WC_MIN_GAP)

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    validated_gaps.sort(key=lambda g: g[0])

    # A margin gap touches the edge of the content area (within 2% tolerance)
    edge_tolerance = max(10, int(content_w * 0.02))
    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # A column starts where a gap ends. The left margin gap only fixes the
    # start of the first column; the right margin gap starts no column.
    if is_left_margin:
        first_col_start_rel = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        first_col_start_rel = 0
        interior_gaps = validated_gaps[:]
    if is_right_margin:
        interior_gaps = interior_gaps[:-1]

    col_starts = [left_x + first_col_start_rel]
    col_starts.extend(left_x + gap_end_rel for _, gap_end_rel in interior_gaps)

    # --- Step 8: Build ColumnGeometry objects ---
    # Each column ends where the next one starts. The rightmost column always
    # extends to the full image width (w) so text near the right border is
    # not cut off.
    col_ends = col_starts[1:] + [w]

    geometries = []
    for i, (start_x, end_x) in enumerate(zip(col_starts, col_ends)):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [wd for wd in word_dicts
                     if col_left_rel <= wd['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{[(g.x, g.word_count) for g in geometries]}")
    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
    # columns (< 3% of content width) holding fewer than 3 words. These are
    # not real columns — remove them and close the gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            # Keep the ratio in sync with the new width.
            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||
@@ -0,0 +1,479 @@
|
||||
"""
|
||||
Document type detection, image preparation, content bounds, and header/footer detection.
|
||||
|
||||
Extracted from cv_layout.py — these are the "input-side" helpers that run before
|
||||
column/row geometry analysis.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
DocumentTypeResult,
|
||||
PageRegion,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Document Type Detection
|
||||
# =============================================================================
|
||||
|
||||
def _closed_gap_runs(profile: np.ndarray, threshold: float,
                     min_length: int) -> List[Tuple[int, int, int]]:
    """Return ``(start, end, length)`` for every closed low-density run.

    A run is a maximal stretch of consecutive samples in *profile* that are
    strictly below *threshold*.  Only runs that are followed by at least one
    sample at/above the threshold are reported; a run that is still open at
    the end of the profile is ignored.  Runs shorter than *min_length* are
    dropped.  ``end`` is exclusive.
    """
    below = profile < threshold
    # Pad with False on both sides so every run has a rising and falling edge.
    padded = np.concatenate(([False], below, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])

    runs: List[Tuple[int, int, int]] = []
    for start, end in zip(edges[0::2], edges[1::2]):
        if end >= below.size:
            # Run touches the end of the profile and was never closed.
            continue
        length = int(end - start)
        if length >= min_length:
            runs.append((int(start), int(end), length))
    return runs


def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Classify a page as vocab table, generic table, or full text.

    The decision is based purely on pixel statistics of *ocr_img* (pixels
    with a value below 128 count as ink): smoothed column/row projection
    profiles are searched for low-density gaps, and a 4x4 grid of ink
    densities is summarised.  No OCR is performed.

    Args:
        ocr_img: Image whose dark pixels (< 128) represent text.
        img_bgr: Accepted for interface compatibility; not read by this
            function.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps
        and a ``features`` dict holding the measured statistics.
    """
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]
    ink = ocr_img < 128

    def _smoothed(profile: np.ndarray, window: int) -> np.ndarray:
        # Box filter with an odd window so the kernel is centred.
        if window % 2 == 0:
            window += 1
        return np.convolve(profile, np.ones(window) / window, mode='same')

    # --- 1. Column profile: ink per x position -> vertical gaps ---
    col_profile = _smoothed(np.sum(ink, axis=0).astype(float), max(3, w // 100))
    # A gap has < 5% of the peak density and spans at least 1% of the width
    # (minimum 5 px).
    col_gap_limit = max(col_profile.max(), 1) * 0.05
    vert_gaps = _closed_gap_runs(col_profile[:w], col_gap_limit, max(5, w // 100))
    gap_count = len(vert_gaps)

    # Gaps that reach into the outer 10% of the width are treated as margins.
    edge_band = w * 0.10
    internal_gap_count = sum(
        1 for gap in vert_gaps if gap[0] > edge_band and gap[1] < w - edge_band
    )

    # --- 2. Row profile: ink per y position -> horizontal gaps ---
    row_profile = _smoothed(np.sum(ink, axis=1).astype(float), max(3, h // 200))
    row_gap_limit = max(row_profile.max(), 1) * 0.05
    row_gap_count = len(
        _closed_gap_runs(row_profile[:h], row_gap_limit, max(3, h // 200))
    )

    # --- 3. Ink density over a 4x4 grid of tiles ---
    grid_rows, grid_cols = 4, 4
    tile_h, tile_w = h // grid_rows, w // grid_cols
    tiles = [
        ocr_img[r * tile_h:(r + 1) * tile_h, c * tile_w:(c + 1) * tile_w]
        for r in range(grid_rows)
        for c in range(grid_cols)
    ]
    densities = [
        float(np.count_nonzero(tile < 128)) / tile.size
        for tile in tiles
        if tile.size > 0
    ]
    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    def _verdict(doc_type, confidence, pipeline, skip_steps):
        return DocumentTypeResult(
            doc_type=doc_type,
            confidence=confidence,
            pipeline=pipeline,
            skip_steps=skip_steps,
            features=features,
        )

    # --- 4. Decision (only internal gaps count as column separators) ---
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Several internal column gaps plus many row gaps -> vocab table.
        score = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return _verdict('vocab_table', round(score, 2), 'cell_first', [])

    if internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure -> generic table.
        score = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return _verdict('generic_table', round(score, 2), 'cell_first', [])

    if internal_gap_count == 0:
        # No internal column gaps -> full text, regardless of density.
        score = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return _verdict('full_text', round(score, 2), 'full_page', ['columns', 'rows'])

    # Ambiguous: fall back to the vocab-table pipeline with low confidence.
    return _verdict('vocab_table', 0.5, 'cell_first', [])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Image Creation (Dual Image Preparation)
|
||||
# =============================================================================
|
||||
|
||||
def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Build the binarized image that is fed to OCR.

    Pipeline: grayscale conversion, background normalization (division by a
    heavily blurred copy), Gaussian adaptive threshold, 3x3 median filter.

    Args:
        img: BGR image.

    Returns:
        The median-filtered result of a ``THRESH_BINARY`` adaptive threshold
        with maximum value 255 (i.e. a two-level image; pixels darker than
        their local neighbourhood end up at 0).
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Dividing by a strongly blurred copy flattens uneven illumination.
    background = cv2.GaussianBlur(grayscale, (51, 51), 0)
    flattened = cv2.divide(grayscale, background, scale=255)

    # Adaptive binarization: 31 px neighbourhood, constant offset 10.
    block_size, offset = 31, 10
    thresholded = cv2.adaptiveThreshold(
        flattened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, block_size, offset
    )

    # Light denoise to remove isolated speckles.
    return cv2.medianBlur(thresholded, 3)
|
||||
|
||||
|
||||
def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Produce a contrast-enhanced grayscale image for layout analysis.

    The image is converted to grayscale and passed through CLAHE
    (clip limit 2.0, 8x8 tile grid).

    Args:
        img: BGR image.

    Returns:
        The CLAHE-enhanced grayscale image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return equalizer.apply(grayscale)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Content Bounds Detection
|
||||
# =============================================================================
|
||||
|
||||
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
|
||||
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
|
||||
out = mask.copy()
|
||||
n = len(out)
|
||||
i = 0
|
||||
while i < n:
|
||||
if out[i]:
|
||||
start = i
|
||||
while i < n and out[i]:
|
||||
i += 1
|
||||
if (i - start) < min_width:
|
||||
out[start:i] = False
|
||||
else:
|
||||
i += 1
|
||||
return out
|
||||
|
||||
|
||||
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Locate the bounding box of the page content, ignoring thin artefacts.

    Row and column projections of *inv* are normalised by ``255 * extent``
    and thresholded at 0.5 %.  Runs of "inked" rows/columns narrower than
    1 % of the corresponding image dimension (minimum 5 px) are discarded
    before the first and last remaining positions are taken.  The box is
    then padded by 5 px vertically and 2 px horizontally, clamped to the
    image.

    Args:
        inv: Image in which non-zero pixels count as ink (values are
            normalised by 255).

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y).  When no content run
        survives along an axis, the full extent of that axis is returned.
    """
    h, w = inv.shape[:2]
    ink_fraction = 0.005

    def _padded_extent(mask: np.ndarray, length: int, pad: int) -> Tuple[int, int]:
        # First/last set position of the mask, padded and clamped.
        hits = np.flatnonzero(mask)
        low, high = 0, length
        if hits.size:
            low = max(0, int(hits[0]) - pad)
            # Index 0 is never taken as the far edge; in that case the
            # default (full length) is kept.
            if hits[-1] > 0:
                high = min(length, int(hits[-1]) + pad)
        return low, high

    # --- Rows: top / bottom ---
    row_density = np.sum(inv, axis=1).astype(float) / (w * 255)
    row_mask = _filter_narrow_runs(row_density > ink_fraction, max(5, h // 100))
    top_y, bottom_y = _padded_extent(row_mask, h, 5)

    # --- Columns: left / right, measured only inside the vertical bounds ---
    col_density = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    band_h = bottom_y - top_y
    if band_h > 0:
        col_density = col_density / (band_h * 255)
    col_mask = _filter_narrow_runs(col_density > ink_fraction, max(5, w // 100))
    left_x, right_x = _padded_extent(col_mask, w, 2)

    return left_x, right_x, top_y, bottom_y
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Header / Footer Detection
|
||||
# =============================================================================
|
||||
|
||||
def _detect_header_footer_gaps(
|
||||
inv: np.ndarray,
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
) -> Tuple[Optional[int], Optional[int]]:
|
||||
"""Detect header/footer boundaries via horizontal projection gap analysis.
|
||||
|
||||
Scans the full-page inverted image for large horizontal gaps in the top/bottom
|
||||
20% that separate header/footer content from the main body.
|
||||
|
||||
Returns:
|
||||
(header_y, footer_y) — absolute y-coordinates.
|
||||
header_y = bottom edge of header region (None if no header detected).
|
||||
footer_y = top edge of footer region (None if no footer detected).
|
||||
"""
|
||||
HEADER_FOOTER_ZONE = 0.20
|
||||
GAP_MULTIPLIER = 2.0
|
||||
|
||||
# Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
|
||||
actual_h = min(inv.shape[0], img_h)
|
||||
roi = inv[:actual_h, :]
|
||||
h_proj = np.sum(roi, axis=1).astype(float)
|
||||
proj_w = roi.shape[1]
|
||||
h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
|
||||
|
||||
# Step 2: Smoothing
|
||||
kernel_size = max(3, actual_h // 200)
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size += 1
|
||||
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
|
||||
|
||||
# Step 3: Gap threshold
|
||||
positive = h_smooth[h_smooth > 0]
|
||||
median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
|
||||
gap_threshold = max(median_density * 0.15, 0.003)
|
||||
|
||||
in_gap = h_smooth < gap_threshold
|
||||
MIN_GAP_HEIGHT = max(3, actual_h // 500)
|
||||
|
||||
# Step 4: Collect contiguous gaps
|
||||
raw_gaps: List[Tuple[int, int]] = []
|
||||
gap_start: Optional[int] = None
|
||||
for y in range(len(in_gap)):
|
||||
if in_gap[y]:
|
||||
if gap_start is None:
|
||||
gap_start = y
|
||||
else:
|
||||
if gap_start is not None:
|
||||
gap_height = y - gap_start
|
||||
if gap_height >= MIN_GAP_HEIGHT:
|
||||
raw_gaps.append((gap_start, y))
|
||||
gap_start = None
|
||||
if gap_start is not None:
|
||||
gap_height = len(in_gap) - gap_start
|
||||
if gap_height >= MIN_GAP_HEIGHT:
|
||||
raw_gaps.append((gap_start, len(in_gap)))
|
||||
|
||||
if not raw_gaps:
|
||||
return None, None
|
||||
|
||||
# Step 5: Compute median gap size and large-gap threshold
|
||||
gap_sizes = [g[1] - g[0] for g in raw_gaps]
|
||||
median_gap = float(np.median(gap_sizes))
|
||||
large_gap_threshold = median_gap * GAP_MULTIPLIER
|
||||
|
||||
# Step 6: Find largest qualifying gap in header / footer zones
|
||||
# A separator gap must have content on BOTH sides — edge-touching gaps
|
||||
# (e.g. dewarp padding at bottom) are not valid separators.
|
||||
EDGE_MARGIN = max(5, actual_h // 400)
|
||||
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
|
||||
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
|
||||
|
||||
header_y: Optional[int] = None
|
||||
footer_y: Optional[int] = None
|
||||
|
||||
best_header_size = 0
|
||||
for gs, ge in raw_gaps:
|
||||
if gs <= EDGE_MARGIN:
|
||||
continue # skip gaps touching the top edge
|
||||
gap_mid = (gs + ge) / 2
|
||||
gap_size = ge - gs
|
||||
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
|
||||
if gap_size > best_header_size:
|
||||
best_header_size = gap_size
|
||||
header_y = ge # bottom edge of gap
|
||||
|
||||
best_footer_size = 0
|
||||
for gs, ge in raw_gaps:
|
||||
if ge >= actual_h - EDGE_MARGIN:
|
||||
continue # skip gaps touching the bottom edge
|
||||
gap_mid = (gs + ge) / 2
|
||||
gap_size = ge - gs
|
||||
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
|
||||
if gap_size > best_footer_size:
|
||||
best_footer_size = gap_size
|
||||
footer_y = gs # top edge of gap
|
||||
|
||||
if header_y is not None:
|
||||
logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
|
||||
f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
|
||||
if footer_y is not None:
|
||||
logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
|
||||
f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
|
||||
|
||||
return header_y, footer_y
|
||||
|
||||
|
||||
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
|
||||
min_density: float = 0.005) -> bool:
|
||||
"""Check whether a horizontal strip contains meaningful ink.
|
||||
|
||||
Args:
|
||||
inv: Inverted binarized image (white-on-black).
|
||||
y_start: Top of the region (inclusive).
|
||||
y_end: Bottom of the region (exclusive).
|
||||
min_density: Fraction of white pixels required to count as content.
|
||||
|
||||
Returns:
|
||||
True if the region contains text/graphics, False if empty margin.
|
||||
"""
|
||||
if y_start >= y_end:
|
||||
return False
|
||||
strip = inv[y_start:y_end, :]
|
||||
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
|
||||
return density > min_density
|
||||
|
||||
|
||||
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Append top and bottom page regions to *regions* in place.

    When *inv* is given, boundaries come from gap-based header/footer
    detection; a boundary within 10 px of the page edge (or a missing one)
    falls back to the content bounds *top_y* / *bottom_y*, which are
    themselves ignored when within 10 px of the edge.

    The region type reflects whether the strip holds ink:
    - 'header' / 'footer' — the strip contains content
    - 'margin_top' / 'margin_bottom' — the strip is empty, or *inv* was not
      supplied so content could not be checked
    """
    gap_header: Optional[int] = None
    gap_footer: Optional[int] = None
    if inv is not None:
        gap_header, gap_footer = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    if gap_header is not None and gap_header > 10:
        top_boundary = gap_header
    elif top_y > 10:
        top_boundary = top_y
    else:
        top_boundary = None

    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        top_region = PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary)
        regions.append(top_region)
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    if gap_footer is not None and gap_footer < img_h - 10:
        bottom_boundary = gap_footer
    elif bottom_y < img_h - 10:
        bottom_boundary = bottom_y
    else:
        bottom_boundary = None

    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        bottom_region = PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                   height=img_h - bottom_boundary)
        regions.append(bottom_region)
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
|
||||
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Layout analysis for OCR vocabulary pages — orchestration and re-exports.
|
||||
|
||||
This module provides the high-level entry points for layout analysis and
|
||||
re-exports all functions from sub-modules for backward compatibility.
|
||||
|
||||
Sub-modules:
|
||||
- cv_layout_detection: Document type detection, image creation, content bounds, header/footer
|
||||
- cv_layout_analyze: Legacy projection-based layout analysis
|
||||
- cv_layout_columns: Core column geometry detection
|
||||
- cv_layout_column_refine: Sub-column, broad-column, expand operations
|
||||
- cv_layout_rows: Row geometry detection
|
||||
- cv_layout_row_regularize: Row grid regularization
|
||||
- cv_layout_scoring: Language/role scoring, dictionary signals
|
||||
- cv_layout_classify: Column type classification (Phase B)
|
||||
- cv_layout_classify_position: Position-based classification fallbacks
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import ColumnGeometry, DetectedBox, PageRegion
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Re-exports (backward compatibility) ───────────────────────────────────
|
||||
|
||||
from cv_layout_detection import ( # noqa: F401
|
||||
detect_document_type,
|
||||
create_ocr_image,
|
||||
create_layout_image,
|
||||
_filter_narrow_runs,
|
||||
_find_content_bounds,
|
||||
_detect_header_footer_gaps,
|
||||
_region_has_content,
|
||||
_add_header_footer,
|
||||
)
|
||||
|
||||
from cv_layout_analyze import ( # noqa: F401
|
||||
analyze_layout,
|
||||
)
|
||||
|
||||
from cv_layout_columns import ( # noqa: F401
|
||||
detect_column_geometry,
|
||||
_detect_columns_by_clustering,
|
||||
_build_geometries_from_starts,
|
||||
)
|
||||
|
||||
from cv_layout_column_refine import ( # noqa: F401
|
||||
_detect_sub_columns,
|
||||
_split_broad_columns,
|
||||
expand_narrow_columns,
|
||||
)
|
||||
|
||||
from cv_layout_rows import ( # noqa: F401
|
||||
detect_row_geometry,
|
||||
_build_rows_from_word_grouping,
|
||||
)
|
||||
|
||||
from cv_layout_row_regularize import ( # noqa: F401
|
||||
_regularize_row_grid,
|
||||
)
|
||||
|
||||
from cv_layout_scoring import ( # noqa: F401
|
||||
_score_language,
|
||||
_score_role,
|
||||
_score_dictionary_signals,
|
||||
_classify_dictionary_columns,
|
||||
)
|
||||
|
||||
from cv_layout_classify import ( # noqa: F401
|
||||
_build_margin_regions,
|
||||
positional_column_regions,
|
||||
classify_column_types,
|
||||
_classify_by_content,
|
||||
)
|
||||
|
||||
from cv_layout_classify_position import ( # noqa: F401
|
||||
_classify_by_position_enhanced,
|
||||
_classify_by_position_fallback,
|
||||
)
|
||||
|
||||
|
||||
# ── Orchestration Functions ───────────────────────────────────────────────
|
||||
|
||||
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect page columns from word geometry and turn them into regions.

    Steps:
      1. detect_column_geometry() finds the column geometries.
      2. Header/footer gaps are measured on the inverted image it returns.
      3. _detect_sub_columns() and _split_broad_columns() refine the columns.
      4. positional_column_regions() converts the result into PageRegions.

    If detect_column_geometry() returns None, the projection-based
    analyze_layout() is used on a CLAHE layout image instead.
    """
    h, w = ocr_img.shape[:2]

    geometry = detect_column_geometry(ocr_img, dewarped_bgr)
    if geometry is None:
        logger.info("LayoutByWords: geometry detection failed, falling back to projection profiles")
        return analyze_layout(create_layout_image(dewarped_bgr), ocr_img)

    geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = geometry
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if _inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(_inv, w, h)
    else:
        header_y, footer_y = None, None

    geometries = _detect_sub_columns(geometries, content_w, left_x=left_x,
                                     top_y=top_y, header_y=header_y, footer_y=footer_y)
    geometries = _split_broad_columns(geometries, content_w, left_x=left_x)

    regions = positional_column_regions(geometries, content_w, content_h, left_x)

    # Summary for the log: column count, classification methods, and the
    # non-header/footer regions.
    col_count = sum(
        1 for region in regions
        if region.type.startswith('column') or region.type == 'page_ref'
    )
    methods = {
        region.classification_method
        for region in regions
        if region.classification_method
    }
    body_summary = [
        (region.type, region.x, region.width, region.classification_confidence)
        for region in regions
        if region.type not in ('header', 'footer', 'margin_top', 'margin_bottom')
    ]
    logger.info(f"LayoutByWords: {col_count} columns detected (methods: {methods}): "
                f"{body_summary}")

    return regions
|
||||
|
||||
|
||||
def detect_column_geometry_zoned(
    ocr_img: np.ndarray,
    dewarped_bgr: np.ndarray,
) -> Optional[Tuple[
    List[ColumnGeometry],
    int, int, int, int,
    List[Dict],
    np.ndarray,
    List[Dict],
    List[DetectedBox],
]]:
    """Zone-aware column geometry detection.

    1. Runs detect_column_geometry() on the full page (returns None if that
       fails).
    2. Runs box detection inside the content bounds.
    3. Without boxes, or without a content zone of at least 40 px height,
       the full-page result is returned with a single whole-page zone.
    4. Otherwise the content zones are stacked vertically into one combined
       image, detect_column_geometry() is re-run on it, and the resulting
       column y-ranges are mapped back to page coordinates.  If that second
       run fails, the full-page columns are kept.
    5. Column word lists are rebuilt from the full-page words whose centre
       lies outside every detected box.

    Returns:
        (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv,
        zones_data, boxes), or None when the initial detection fails.
    """
    from cv_box_detect import detect_boxes, split_page_into_zones

    page_result = detect_column_geometry(ocr_img, dewarped_bgr)
    if page_result is None:
        return None

    geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = page_result
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    def _whole_page_result(found_boxes):
        # Result shape used whenever zoning is not applicable.
        single_zone = [{
            "index": 0, "zone_type": "content",
            "y": top_y, "height": content_h,
            "x": left_x, "width": content_w, "columns": [],
        }]
        return (geometries, left_x, right_x, top_y, bottom_y,
                word_dicts, inv, single_zone, found_boxes)

    boxes = detect_boxes(dewarped_bgr, left_x, content_w, top_y, content_h)
    if not boxes:
        return _whole_page_result(boxes)

    zones = split_page_into_zones(left_x, top_y, content_w, content_h, boxes)

    # (y_start, y_end) of every content zone tall enough to analyse.
    content_strips: List[Tuple[int, int]] = [
        (zone.y, zone.y + zone.height)
        for zone in zones
        if zone.zone_type == 'content' and zone.height >= 40
    ]
    if not content_strips:
        logger.info("ZonedColumns: no content zones with height >= 40, using original result")
        return _whole_page_result(boxes)

    # Stack the content strips so boxes do not disturb column detection.
    combined_ocr = np.vstack([ocr_img[y0:y1, :] for y0, y1 in content_strips])
    combined_bgr = np.vstack([dewarped_bgr[y0:y1, :] for y0, y1 in content_strips])

    logger.info(f"ZonedColumns: {len(boxes)} box(es), concatenating {len(content_strips)} "
                f"content strips into combined image {combined_ocr.shape[1]}x{combined_ocr.shape[0]}")

    combined_result = detect_column_geometry(combined_ocr, combined_bgr)
    if combined_result is None:
        logger.info("ZonedColumns: combined image column detection failed, using original")
        combined_geoms = geometries
    else:
        combined_geoms, _c_lx, _c_rx, _c_ty, _c_by, _c_words, _c_inv = combined_result

    # (offset in combined image, strip height, y on the page) per strip.
    strip_offsets: List[Tuple[int, int, int]] = []
    running_offset = 0
    for y0, y1 in content_strips:
        strip_h = y1 - y0
        strip_offsets.append((running_offset, strip_h, y0))
        running_offset += strip_h

    def _to_page_y(combined_y: int) -> int:
        # Map a y in the stacked image back to the page.
        for offset, strip_h, page_y in strip_offsets:
            if combined_y < offset + strip_h:
                return page_y + (combined_y - offset)
        # Past the last strip: clamp to that strip's bottom edge.
        _last_offset, last_h, last_page_y = strip_offsets[-1]
        return last_page_y + last_h

    if combined_result is not None:
        for geom in combined_geoms:
            page_top = _to_page_y(geom.y)
            page_bottom = _to_page_y(geom.y + geom.height)
            geom.y = page_top
            geom.height = page_bottom - page_top

    if word_dicts:
        def _center_in_any_box(word: Dict) -> bool:
            # Word coordinates are relative to (left_x, top_y).
            center_x = word['left'] + left_x + word['width'] / 2
            center_y = word['top'] + top_y + word['height'] / 2
            return any(
                box.x <= center_x <= box.x + box.width
                and box.y <= center_y <= box.y + box.height
                for box in boxes
            )

        content_words = [word for word in word_dicts if not _center_in_any_box(word)]

        target_geoms = combined_geoms if combined_result is not None else geometries
        for geom in target_geoms:
            rel_left = geom.x - left_x
            rel_right = rel_left + geom.width
            geom.words = [
                word for word in content_words
                if rel_left <= word['left'] + word['width'] / 2 < rel_right
            ]
            geom.word_count = len(geom.words)

        excluded_count = len(word_dicts) - len(content_words)
        if excluded_count:
            logger.info(
                "ZonedColumns: enriched geometries with %d content words "
                "(excluded %d box-interior words)",
                len(content_words), excluded_count,
            )

    zones_data: List[Dict] = []
    for zone in zones:
        entry: Dict = {
            "index": zone.index,
            "zone_type": zone.zone_type,
            "y": zone.y,
            "height": zone.height,
            "x": zone.x,
            "width": zone.width,
            "columns": [],
        }
        box = zone.box
        if box is not None:
            entry["box"] = {
                "x": box.x, "y": box.y,
                "width": box.width, "height": box.height,
                "confidence": box.confidence,
                "border_thickness": box.border_thickness,
            }
        zones_data.append(entry)

    all_geometries = combined_geoms if combined_geoms else geometries

    logger.info(f"ZonedColumns: {len(boxes)} box(es), {len(zones)} zone(s), "
                f"{len(all_geometries)} total columns (combined-image approach)")

    return (all_geometries, left_x, right_x, top_y, bottom_y,
            word_dicts, inv, zones_data, boxes)
|
||||
@@ -0,0 +1,329 @@
|
||||
"""
|
||||
Row grid regularization for document layout analysis.
|
||||
|
||||
Provides word-center-based row boundary refinement to improve
|
||||
gap-based row detection. Extracted from cv_layout_rows.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _regularize_row_grid(
|
||||
rows: List['RowGeometry'],
|
||||
word_dicts: List[Dict],
|
||||
left_x: int, right_x: int,
|
||||
top_y: int,
|
||||
content_w: int, content_h: int,
|
||||
inv: np.ndarray,
|
||||
) -> List['RowGeometry']:
|
||||
"""Rebuild row boundaries from word center-lines with section-break awareness.
|
||||
|
||||
Instead of overlaying a rigid grid, this derives row positions bottom-up
|
||||
from the words themselves:
|
||||
|
||||
Step A: Group all content words into line clusters by Y-proximity.
|
||||
Tolerance = 40% of median gap-based row height.
|
||||
Step B: For each cluster compute:
|
||||
- center_y = median of (word_top + word_height/2) for all words
|
||||
- letter_h = median of word heights (excluding outliers > 2× median)
|
||||
Step B2: Merge clusters whose centers are closer than 30% of row height
|
||||
(spurious splits from OCR jitter).
|
||||
Step C: Compute pitches (distances between consecutive centers).
|
||||
Detect section breaks where gap > 1.8× median pitch.
|
||||
Step D: Split clusters into sections at the section breaks.
|
||||
Step E: Within each section, place row boundaries at midpoints between
|
||||
consecutive line centers:
|
||||
- First row top = center - local_pitch/2
|
||||
- Last row bottom = center + local_pitch/2
|
||||
- Interior boundaries = (center_i + center_{i+1}) / 2
|
||||
This ensures rows tile seamlessly without gaps or overlaps.
|
||||
Step F: Re-assign words to the nearest grid row by vertical center distance.
|
||||
Step G: Validate that >= 85% of words land in a grid row; otherwise
|
||||
fall back to the original gap-based rows.
|
||||
Step H: Merge with preserved header/footer rows and re-index.
|
||||
|
||||
Guard: Requires >= 5 content rows from gap-based detection to activate.
|
||||
This prevents the regularizer from running on very small images (e.g.
|
||||
box sub-sessions with only 3-6 rows) where the gap-based detection
|
||||
is already accurate enough.
|
||||
|
||||
Header/footer rows from the gap-based detection are preserved.
|
||||
"""
|
||||
content_rows = [r for r in rows if r.row_type == 'content']
|
||||
non_content = [r for r in rows if r.row_type != 'content']
|
||||
|
||||
if len(content_rows) < 5:
|
||||
return rows
|
||||
|
||||
# --- Step A: Group ALL words into line clusters ---
|
||||
# Collect words that belong to content rows (deduplicated)
|
||||
content_words: List[Dict] = []
|
||||
seen_keys: set = set()
|
||||
for r in content_rows:
|
||||
for w in r.words:
|
||||
key = (w['left'], w['top'], w['width'], w['height'])
|
||||
if key not in seen_keys:
|
||||
seen_keys.add(key)
|
||||
content_words.append(w)
|
||||
|
||||
if len(content_words) < 5:
|
||||
return rows
|
||||
|
||||
# Compute median word height (excluding outliers like tall brackets/IPA)
|
||||
word_heights = sorted(w['height'] for w in content_words)
|
||||
median_wh = word_heights[len(word_heights) // 2]
|
||||
|
||||
# Compute median gap-based row height — this is the actual line height
|
||||
# as detected by the horizontal projection. We use 40% of this as
|
||||
# grouping tolerance. This is much more reliable than using word height
|
||||
# alone, because words on the same line can have very different heights
|
||||
# (e.g. lowercase vs uppercase, brackets, phonetic symbols).
|
||||
gap_row_heights = sorted(r.height for r in content_rows)
|
||||
median_row_h = gap_row_heights[len(gap_row_heights) // 2]
|
||||
|
||||
# Tolerance: 40% of row height. Words on the same line should have
|
||||
# centers within this range. Even if a word's bbox is taller/shorter,
|
||||
# its center should stay within half a row height of the line center.
|
||||
y_tol = max(10, int(median_row_h * 0.4))
|
||||
|
||||
# Sort by center_y, then group by proximity
|
||||
words_by_center = sorted(content_words,
|
||||
key=lambda w: (w['top'] + w['height'] / 2, w['left']))
|
||||
line_clusters: List[List[Dict]] = []
|
||||
current_line: List[Dict] = [words_by_center[0]]
|
||||
current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
|
||||
|
||||
for w in words_by_center[1:]:
|
||||
w_center = w['top'] + w['height'] / 2
|
||||
if abs(w_center - current_center) <= y_tol:
|
||||
current_line.append(w)
|
||||
else:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
line_clusters.append(current_line)
|
||||
current_line = [w]
|
||||
current_center = w_center
|
||||
|
||||
if current_line:
|
||||
current_line.sort(key=lambda w: w['left'])
|
||||
line_clusters.append(current_line)
|
||||
|
||||
if len(line_clusters) < 3:
|
||||
return rows
|
||||
|
||||
# --- Step B: Compute center_y per cluster ---
|
||||
# center_y = median of (word_top + word_height/2) across all words in cluster
|
||||
# letter_h = median of word heights, but excluding outlier-height words
|
||||
# (>2× median) so that tall brackets/IPA don't skew the height
|
||||
cluster_info: List[Dict] = []
|
||||
for cl_words in line_clusters:
|
||||
centers = [w['top'] + w['height'] / 2 for w in cl_words]
|
||||
# Filter outlier heights for letter_h computation
|
||||
normal_heights = [w['height'] for w in cl_words
|
||||
if w['height'] <= median_wh * 2.0]
|
||||
if not normal_heights:
|
||||
normal_heights = [w['height'] for w in cl_words]
|
||||
center_y = float(np.median(centers))
|
||||
letter_h = float(np.median(normal_heights))
|
||||
cluster_info.append({
|
||||
'center_y_rel': center_y, # relative to content ROI
|
||||
'center_y_abs': center_y + top_y, # absolute
|
||||
'letter_h': letter_h,
|
||||
'words': cl_words,
|
||||
})
|
||||
|
||||
cluster_info.sort(key=lambda c: c['center_y_rel'])
|
||||
|
||||
# --- Step B2: Merge clusters that are too close together ---
|
||||
# Even with center-based grouping, some edge cases can produce
|
||||
# spurious clusters. Merge any pair whose centers are closer
|
||||
# than 30% of the row height (they're definitely the same text line).
|
||||
merge_threshold = max(8, median_row_h * 0.3)
|
||||
merged: List[Dict] = [cluster_info[0]]
|
||||
for cl in cluster_info[1:]:
|
||||
prev = merged[-1]
|
||||
if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
|
||||
# Merge: combine words, recompute center
|
||||
combined_words = prev['words'] + cl['words']
|
||||
centers = [w['top'] + w['height'] / 2 for w in combined_words]
|
||||
normal_heights = [w['height'] for w in combined_words
|
||||
if w['height'] <= median_wh * 2.0]
|
||||
if not normal_heights:
|
||||
normal_heights = [w['height'] for w in combined_words]
|
||||
prev['center_y_rel'] = float(np.median(centers))
|
||||
prev['center_y_abs'] = prev['center_y_rel'] + top_y
|
||||
prev['letter_h'] = float(np.median(normal_heights))
|
||||
prev['words'] = combined_words
|
||||
else:
|
||||
merged.append(cl)
|
||||
|
||||
cluster_info = merged
|
||||
|
||||
if len(cluster_info) < 3:
|
||||
return rows
|
||||
|
||||
# --- Step C: Compute pitches and detect section breaks ---
|
||||
pitches: List[float] = []
|
||||
for i in range(1, len(cluster_info)):
|
||||
pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
|
||||
pitches.append(pitch)
|
||||
|
||||
if not pitches:
|
||||
return rows
|
||||
|
||||
median_pitch = float(np.median(pitches))
|
||||
if median_pitch <= 5:
|
||||
return rows
|
||||
|
||||
# A section break is where the gap between line centers is much larger
|
||||
# than the normal pitch (sub-headings, section titles, etc.)
|
||||
BREAK_FACTOR = 1.8
|
||||
|
||||
# --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
|
||||
sections: List[List[Dict]] = []
|
||||
current_section: List[Dict] = [cluster_info[0]]
|
||||
|
||||
for i in range(1, len(cluster_info)):
|
||||
gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
|
||||
if gap > median_pitch * BREAK_FACTOR:
|
||||
sections.append(current_section)
|
||||
current_section = [cluster_info[i]]
|
||||
else:
|
||||
current_section.append(cluster_info[i])
|
||||
|
||||
if current_section:
|
||||
sections.append(current_section)
|
||||
|
||||
# --- Step E: Build row boundaries per section ---
|
||||
grid_rows: List[RowGeometry] = []
|
||||
|
||||
for section in sections:
|
||||
if not section:
|
||||
continue
|
||||
|
||||
if len(section) == 1:
|
||||
# Single-line section (likely a heading)
|
||||
cl = section[0]
|
||||
half_h = max(cl['letter_h'], median_pitch * 0.4)
|
||||
row_top = cl['center_y_abs'] - half_h
|
||||
row_bot = cl['center_y_abs'] + half_h
|
||||
grid_rows.append(RowGeometry(
|
||||
index=0,
|
||||
x=left_x,
|
||||
y=round(row_top),
|
||||
width=content_w,
|
||||
height=round(row_bot - row_top),
|
||||
word_count=len(cl['words']),
|
||||
words=cl['words'],
|
||||
row_type='content',
|
||||
gap_before=0,
|
||||
))
|
||||
continue
|
||||
|
||||
# Compute local pitch for this section
|
||||
local_pitches = []
|
||||
for i in range(1, len(section)):
|
||||
local_pitches.append(
|
||||
section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
|
||||
)
|
||||
local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch
|
||||
|
||||
# Row boundaries are placed at midpoints between consecutive centers.
|
||||
# First row: top = center - local_pitch/2
|
||||
# Last row: bottom = center + local_pitch/2
|
||||
for i, cl in enumerate(section):
|
||||
if i == 0:
|
||||
row_top = cl['center_y_abs'] - local_pitch / 2
|
||||
else:
|
||||
# Midpoint between this center and previous center
|
||||
prev_center = section[i - 1]['center_y_abs']
|
||||
row_top = (prev_center + cl['center_y_abs']) / 2
|
||||
|
||||
if i == len(section) - 1:
|
||||
row_bot = cl['center_y_abs'] + local_pitch / 2
|
||||
else:
|
||||
next_center = section[i + 1]['center_y_abs']
|
||||
row_bot = (cl['center_y_abs'] + next_center) / 2
|
||||
|
||||
# Clamp to reasonable bounds
|
||||
row_top = max(top_y, row_top)
|
||||
row_bot = min(top_y + content_h, row_bot)
|
||||
|
||||
if row_bot - row_top < 5:
|
||||
continue
|
||||
|
||||
grid_rows.append(RowGeometry(
|
||||
index=0,
|
||||
x=left_x,
|
||||
y=round(row_top),
|
||||
width=content_w,
|
||||
height=round(row_bot - row_top),
|
||||
word_count=len(cl['words']),
|
||||
words=cl['words'],
|
||||
row_type='content',
|
||||
gap_before=0,
|
||||
))
|
||||
|
||||
if not grid_rows:
|
||||
return rows
|
||||
|
||||
# --- Step F: Re-assign words to grid rows ---
|
||||
# Words may have shifted slightly; assign each word to the row whose
|
||||
# center is closest to the word's vertical center.
|
||||
for gr in grid_rows:
|
||||
gr.words = []
|
||||
|
||||
for w in content_words:
|
||||
w_center = w['top'] + top_y + w['height'] / 2
|
||||
best_row = None
|
||||
best_dist = float('inf')
|
||||
for gr in grid_rows:
|
||||
row_center = gr.y + gr.height / 2
|
||||
dist = abs(w_center - row_center)
|
||||
if dist < best_dist:
|
||||
best_dist = dist
|
||||
best_row = gr
|
||||
if best_row is not None and best_dist < median_pitch:
|
||||
best_row.words.append(w)
|
||||
|
||||
for gr in grid_rows:
|
||||
gr.word_count = len(gr.words)
|
||||
|
||||
# --- Step G: Validate ---
|
||||
words_placed = sum(gr.word_count for gr in grid_rows)
|
||||
if len(content_words) > 0:
|
||||
match_ratio = words_placed / len(content_words)
|
||||
if match_ratio < 0.85:
|
||||
logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
|
||||
f"of words, keeping gap-based rows")
|
||||
return rows
|
||||
|
||||
# Remove empty grid rows (no words assigned)
|
||||
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
|
||||
|
||||
# --- Step H: Merge header/footer + re-index ---
|
||||
result = list(non_content) + grid_rows
|
||||
result.sort(key=lambda r: r.y)
|
||||
for i, r in enumerate(result):
|
||||
r.index = i
|
||||
|
||||
row_heights = [gr.height for gr in grid_rows]
|
||||
min_h = min(row_heights) if row_heights else 0
|
||||
max_h = max(row_heights) if row_heights else 0
|
||||
logger.info(f"RowGrid: word-center grid applied "
|
||||
f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
|
||||
f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
|
||||
f"{len(sections)} sections, "
|
||||
f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
|
||||
f"was {len(content_rows)} gap-based rows)")
|
||||
|
||||
return result
|
||||
@@ -0,0 +1,352 @@
|
||||
"""
|
||||
Row geometry detection for document layout analysis.
|
||||
|
||||
Provides horizontal whitespace-gap analysis to detect text rows,
|
||||
word-center grid regularization, and fallback word-grouping.
|
||||
|
||||
Extracted from cv_layout.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
from cv_ocr_word_assembly import _group_words_into_lines
|
||||
from cv_layout_row_regularize import _regularize_row_grid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Row Geometry Detection (horizontal whitespace-gap analysis)
|
||||
# =============================================================================
|
||||
|
||||
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Phase 1 - gap-based detection (Steps 1-6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink across the content width. Only pixels inside word bounding
         boxes (plus a small vertical padding) contribute, so that
         illustrations don't merge adjacent text rows.
      2. Smooth the profile and mark y-positions below a threshold as gap.
         Threshold = 15% of the median non-zero density (at least 0.003).
      3. Collect contiguous gap runs of at least MIN_GAP_HEIGHT pixels.
      4. Validate gaps against word boxes: a gap overlapping a word is
         shrunk to the part above the topmost / below the bottommost
         overlapping word, or discarded if neither part is tall enough.
      5. Header/footer: the largest gap whose midpoint lies in the top
         (bottom) 15% of the content height and that is more than 2x the
         median gap size marks the header (footer) boundary.
      6. Build rows from the spans between consecutive validated gaps.

    Phase 2 - word-center regularization (Step 7):
      The rows are handed to ``_regularize_row_grid`` for refinement.

    Fallback:
      With fewer than 2 validated gaps, rows are built by
      ``_build_rows_from_word_grouping`` instead.

    Args:
        inv: Inverted binarized page image; sliced with the absolute
            content bounds. Presumably uint8 with ink = 255 - confirm
            against the caller.
        word_dicts: Word boxes with 'left', 'top', 'width', 'height',
            relative to the content ROI.
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects; empty if the content area is smaller
        than 10 px in either dimension.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        # A word lying entirely outside the ROI yields an empty (or negative)
        # span; a negative slice end would wrap around, so skip it.
        if y2 <= y1 or x2 <= x1:
            continue
        word_mask[y1:y2, x1:x2] = 255

    # NumPy instead of cv2.bitwise_and: this module treats cv2 as optional
    # (cv2 is None when the import fails) and the element-wise AND is the same.
    masked_strip = np.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255)  # content_w >= 10 here

    # --- Step 2: Smoothing + gap threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the box filter symmetric
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)  # ignore tiny noise gaps

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y, is_gap in enumerate(in_gap):
        if is_gap:
            if gap_start is None:
                gap_start = y
        elif gap_start is not None:
            if y - gap_start >= MIN_GAP_HEIGHT:
                raw_gaps.append((gap_start, y))
            gap_start = None
    # A gap still open at the bottom edge ends at the end of the profile.
    if gap_start is not None and len(in_gap) - gap_start >= MIN_GAP_HEIGHT:
        raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    word_spans = [(wd['top'], wd['top'] + wd['height']) for wd in word_dicts]
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        hits = [(w_top, w_bot) for w_top, w_bot in word_spans
                if w_top < gap_end_rel and w_bot > gap_start_rel]
        if not hits:
            validated_gaps.append((gap_start_rel, gap_end_rel))
            continue

        # Try to shrink the gap so it no longer overlaps any word:
        # first the part above the topmost word, else the part below
        # the bottommost word.
        min_word_top = min(w_top for w_top, _ in hits)
        max_word_bottom = max(w_bot for _, w_bot in hits)

        if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
            validated_gaps.append((gap_start_rel, min_word_top))
        elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
            validated_gaps.append((max_word_bottom, gap_end_rel))
        else:
            logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # rows with midpoint above this are header
    footer_boundary_rel = None  # rows with midpoint below this are footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    def _largest_gap(in_zone):
        """Return the largest over-threshold gap whose midpoint is in the zone."""
        best = None
        for gs, ge in validated_gaps:
            gap_size = ge - gs
            if in_zone((gs + ge) / 2) and gap_size > large_gap_threshold:
                if best is None or gap_size > (best[1] - best[0]):
                    best = (gs, ge)
        return best

    best_header_gap = _largest_gap(lambda mid: mid < header_zone_limit)
    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    best_footer_gap = _largest_gap(lambda mid: mid > footer_zone_start)
    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # A row spans from the end of one gap to the start of the next.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for (_, prev_end), (next_start, _) in zip(validated_gaps, validated_gaps[1:]):
        if next_start - prev_end > 0:
            row_boundaries.append((prev_end, next_start))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    # Size of the gap that ends exactly where a row starts (first match wins).
    gap_size_by_end = {}
    for gs, ge in validated_gaps:
        gap_size_by_end.setdefault(ge, ge - gs)

    word_centers = [(w, w['top'] + w['height'] / 2) for w in word_dicts]

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type from the row midpoint
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Words whose vertical center falls inside [row_start, row_end)
        row_words = [w for w, center in word_centers
                     if row_start_rel <= center < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0:
            # NOTE(review): for the first row this stores the START position
            # of the first gap rather than a gap size - kept as-is because
            # downstream consumers are not visible here; confirm intent.
            if validated_gaps[0][0] > 0:
                gap_before = validated_gaps[0][0]
        else:
            gap_before = gap_size_by_end.get(row_start_rel, 0)

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows
|
||||
|
||||
|
||||
def _build_rows_from_word_grouping(
|
||||
word_dicts: List[Dict],
|
||||
left_x: int, right_x: int,
|
||||
top_y: int, bottom_y: int,
|
||||
content_w: int, content_h: int,
|
||||
) -> List['RowGeometry']:
|
||||
"""Fallback: build rows by grouping words by Y position.
|
||||
|
||||
Uses _group_words_into_lines() with a generous tolerance.
|
||||
No header/footer detection in fallback mode.
|
||||
"""
|
||||
if not word_dicts:
|
||||
return []
|
||||
|
||||
y_tolerance = max(20, content_h // 100)
|
||||
lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
|
||||
|
||||
rows = []
|
||||
for idx, line_words in enumerate(lines):
|
||||
if not line_words:
|
||||
continue
|
||||
min_top = min(w['top'] for w in line_words)
|
||||
max_bottom = max(w['top'] + w['height'] for w in line_words)
|
||||
row_height = max_bottom - min_top
|
||||
|
||||
rows.append(RowGeometry(
|
||||
index=idx,
|
||||
x=left_x,
|
||||
y=top_y + min_top,
|
||||
width=content_w,
|
||||
height=row_height,
|
||||
word_count=len(line_words),
|
||||
words=line_words,
|
||||
row_type='content',
|
||||
gap_before=0,
|
||||
))
|
||||
|
||||
logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
|
||||
return rows
|
||||
@@ -0,0 +1,441 @@
|
||||
"""
|
||||
Language scoring, role scoring, and dictionary detection/classification.
|
||||
|
||||
Extracted from cv_layout.py to keep modules under 500 LOC.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
ColumnGeometry,
|
||||
ENGLISH_FUNCTION_WORDS,
|
||||
GERMAN_FUNCTION_WORDS,
|
||||
PageRegion,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Dictionary / Wörterbuch Detection ---
|
||||
|
||||
# Article words that appear as a dedicated column in dictionaries
|
||||
_DICT_ARTICLE_WORDS = {
|
||||
# German articles
|
||||
"die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
|
||||
# English articles / infinitive marker
|
||||
"the", "a", "an", "to",
|
||||
}
|
||||
|
||||
|
||||
# --- Phase B: Content-Based Classification ---
|
||||
|
||||
def _score_language(words: List[Dict]) -> Dict[str, float]:
|
||||
"""Score the language of a column's words.
|
||||
|
||||
Analyzes function words, umlauts, and capitalization patterns
|
||||
to determine whether text is English or German.
|
||||
|
||||
Args:
|
||||
words: List of word dicts with 'text' and 'conf' keys.
|
||||
|
||||
Returns:
|
||||
Dict with 'eng' and 'deu' scores (0.0-1.0).
|
||||
"""
|
||||
if not words:
|
||||
return {'eng': 0.0, 'deu': 0.0}
|
||||
|
||||
# Only consider words with decent confidence
|
||||
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
|
||||
if not good_words:
|
||||
return {'eng': 0.0, 'deu': 0.0}
|
||||
|
||||
total = len(good_words)
|
||||
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
|
||||
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
|
||||
|
||||
# Check for umlauts (strong German signal)
|
||||
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
|
||||
umlaut_count = sum(1 for t in raw_texts
|
||||
for c in t if c in 'äöüÄÖÜß')
|
||||
|
||||
# German capitalization: nouns are capitalized mid-sentence
|
||||
# Count words that start with uppercase but aren't at position 0
|
||||
cap_words = sum(1 for t in raw_texts if t[0].isupper() and len(t) > 2)
|
||||
|
||||
en_score = en_hits / total if total > 0 else 0.0
|
||||
de_score = de_hits / total if total > 0 else 0.0
|
||||
|
||||
# Boost German score for umlauts
|
||||
if umlaut_count > 0:
|
||||
de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
|
||||
|
||||
# Boost German score for high capitalization ratio (typical for German nouns)
|
||||
if total > 5:
|
||||
cap_ratio = cap_words / total
|
||||
if cap_ratio > 0.3:
|
||||
de_score = min(1.0, de_score + 0.1)
|
||||
|
||||
return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
|
||||
|
||||
|
||||
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
|
||||
"""Score the role of a column based on its geometry and content patterns.
|
||||
|
||||
Args:
|
||||
geom: ColumnGeometry with words and dimensions.
|
||||
|
||||
Returns:
|
||||
Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
|
||||
"""
|
||||
scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
|
||||
|
||||
if not geom.words:
|
||||
return scores
|
||||
|
||||
texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
|
||||
if not texts:
|
||||
return scores
|
||||
|
||||
avg_word_len = sum(len(t) for t in texts) / len(texts)
|
||||
has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
|
||||
digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
|
||||
digit_ratio = digit_words / len(texts) if texts else 0.0
|
||||
|
||||
# Reference: narrow + mostly numbers/page references
|
||||
if geom.width_ratio < 0.12:
|
||||
scores['reference'] = 0.5
|
||||
if digit_ratio > 0.4:
|
||||
scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
|
||||
|
||||
# Marker: narrow + few short entries
|
||||
if geom.width_ratio < 0.06 and geom.word_count <= 15:
|
||||
scores['marker'] = 0.7
|
||||
if avg_word_len < 4:
|
||||
scores['marker'] = 0.9
|
||||
# Very narrow non-edge column → strong marker regardless of word count
|
||||
if geom.width_ratio < 0.04 and geom.index > 0:
|
||||
scores['marker'] = max(scores['marker'], 0.9)
|
||||
|
||||
# Sentence: longer words + punctuation present
|
||||
if geom.width_ratio > 0.15 and has_punctuation > 2:
|
||||
scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
|
||||
if avg_word_len > 4:
|
||||
scores['sentence'] = min(1.0, scores['sentence'] + 0.2)
|
||||
|
||||
# Vocabulary: medium width + medium word length
|
||||
if 0.10 < geom.width_ratio < 0.45:
|
||||
scores['vocabulary'] = 0.4
|
||||
if 3 < avg_word_len < 8:
|
||||
scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)
|
||||
|
||||
return {k: round(v, 3) for k, v in scores.items()}
|
||||
|
||||
|
||||
def _score_dictionary_signals(
    geometries: List[ColumnGeometry],
    document_category: Optional[str] = None,
    margin_strip_detected: bool = False,
) -> Dict[str, Any]:
    """Estimate whether a page is a dictionary page from its column contents.

    Four signals are measured across all columns and blended into a single
    confidence value:

    1. Alphabetical ordering of the words read top-to-bottom (weight 0.35).
    2. Article evidence (weight 0.25): a narrow column made of article
       words, texts that start with an article, or a high share of article
       words next to regular words.
    3. First-letter uniformity of the words in a column (weight 0.25).
    4. Decorative A-Z margin strip, as reported by the caller (weight 0.15).

    Args:
        geometries: Columns to inspect; each must expose ``words``,
            ``index`` and ``width_ratio``.
        document_category: User-selected category; the value
            ``'woerterbuch'`` adds 0.20 to the combined score.
        margin_strip_detected: Whether a decorative A-Z margin strip was
            found upstream.

    Returns:
        Dict with 'is_dictionary', 'confidence', 'article_col_index',
        'headword_col_index', and a 'signals' sub-dict holding the
        individual measurements.
    """
    signals: Dict[str, Any] = {}
    result: Dict[str, Any] = {
        "is_dictionary": False,
        "confidence": 0.0,
        "article_col_index": None,
        "headword_col_index": None,
        "signals": signals,
    }

    # At least two columns are required before any signal is evaluated.
    if not geometries or len(geometries) < 2:
        return result

    def _ordered_texts(column) -> List[str]:
        # Words sorted top-to-bottom, keeping confident ones with >= 2 chars.
        by_top = sorted(column.words, key=lambda word: word.get("top", 0))
        return [
            word["text"].strip().lower()
            for word in by_top
            if word.get("conf", 0) > 30 and len(word["text"].strip()) >= 2
        ]

    def _collapse_runs(items: List[str]) -> List[str]:
        # Drop an item when it repeats its direct predecessor.
        return [
            cur for pos, cur in enumerate(items)
            if pos == 0 or cur != items[pos - 1]
        ]

    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
    best_alpha_score, best_alpha_col = 0.0, -1
    for column in geometries:
        words = _ordered_texts(column)
        if len(words) < 5:
            continue
        # Consecutive duplicates are treated as OCR double-reads.
        distinct = _collapse_runs(words)
        if len(distinct) < 5:
            continue
        in_order = sum(
            1 for left, right in zip(distinct, distinct[1:]) if left <= right
        )
        ratio = in_order / (len(distinct) - 1)
        if ratio > best_alpha_score:
            best_alpha_score, best_alpha_col = ratio, column.index

    signals["alphabetical_score"] = round(best_alpha_score, 3)
    signals["alphabetical_col"] = best_alpha_col

    # --- Signal 2: Article detection (weight 0.25) ---
    #   (a) dedicated narrow column consisting mostly of article words
    #   (b) inline articles: texts beginning with "<article> "
    #   (c) article share among individual words (OCR-split word boxes)
    inline_prefixes = tuple(art + " " for art in _DICT_ARTICLE_WORDS)
    best_article_density, best_article_col = 0.0, -1
    best_inline_article_ratio = 0.0
    best_article_word_ratio = 0.0

    for column in geometries:
        words = [
            word["text"].strip().lower()
            for word in column.words
            if word.get("conf", 0) > 30 and len(word["text"].strip()) > 0
        ]
        total = len(words)
        if total < 3:
            continue

        n_articles = sum(1 for text in words if text in _DICT_ARTICLE_WORDS)
        share = n_articles / total

        # (a) only columns up to 20% of the width qualify as article column
        if column.width_ratio <= 0.20 and share > best_article_density:
            best_article_density, best_article_col = share, column.index

        # (b) texts such as "der Zustand"
        n_inline = sum(1 for text in words if text.startswith(inline_prefixes))
        best_inline_article_ratio = max(best_inline_article_ratio, n_inline / total)

        # (c) needs >= 3 article words and >= 3 other words; a pure article
        #     column is already covered by (a)
        if n_articles >= 3 and total - n_articles >= 3:
            best_article_word_ratio = max(best_article_word_ratio, share)

    # Strongest of the three; the raw word ratio is slightly discounted.
    effective_article_score = max(
        best_article_density,
        best_inline_article_ratio,
        best_article_word_ratio * 0.8,
    )

    signals["article_density"] = round(best_article_density, 3)
    signals["inline_article_ratio"] = round(best_inline_article_ratio, 3)
    signals["article_word_ratio"] = round(best_article_word_ratio, 3)
    signals["article_col"] = best_article_col

    # --- Signal 3: First-letter uniformity (weight 0.25) ---
    best_uniformity, best_uniform_col = 0.0, -1
    has_letter_transition = False
    for column in geometries:
        words = _ordered_texts(column)
        if len(words) < 5:
            continue
        initials = [text[0] for text in words if text[0].isalpha()]
        if not initials:
            continue
        uniformity = max(Counter(initials).values()) / len(initials)

        # 2-5 runs of initials in ascending order count as an orderly
        # letter transition and lift the uniformity to at least 0.70.
        letter_runs = _collapse_runs(initials)
        if 2 <= len(letter_runs) <= 5 and all(
            a <= b for a, b in zip(letter_runs, letter_runs[1:])
        ):
            has_letter_transition = True
            uniformity = max(uniformity, 0.70)

        if uniformity > best_uniformity:
            best_uniformity, best_uniform_col = uniformity, column.index

    signals["first_letter_uniformity"] = round(best_uniformity, 3)
    signals["uniform_col"] = best_uniform_col
    signals["has_letter_transition"] = has_letter_transition

    # --- Signal 4: Decorative margin strip (weight 0.15) ---
    signals["margin_strip_detected"] = margin_strip_detected

    # --- Combine signals ---
    combined = (
        min(best_alpha_score, 1.0) * 0.35
        + min(effective_article_score, 1.0) * 0.25
        + min(best_uniformity, 1.0) * 0.25
        + (1.0 if margin_strip_detected else 0.0) * 0.15
    )

    # Explicit user category adds a fixed boost, capped at 1.0.
    if document_category == "woerterbuch":
        combined = min(1.0, combined + 0.20)
        signals["category_boost"] = True

    result["confidence"] = round(combined, 3)

    # A combined score of 0.40 or more classifies the page as dictionary.
    if combined >= 0.40:
        result["is_dictionary"] = True
        # Headword column: alphabetical winner first, uniformity winner second.
        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
            result["headword_col_index"] = best_alpha_col
        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
            result["headword_col_index"] = best_uniform_col
        if best_article_col >= 0 and best_article_density >= 0.30:
            result["article_col_index"] = best_article_col
        # Strong inline articles without a dedicated column are only flagged.
        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
            signals["inline_articles_detected"] = True

    logger.info(
        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
        combined, result["is_dictionary"], result["signals"],
    )

    return result
|
||||
|
||||
|
||||
def _classify_dictionary_columns(
    geometries: List[ColumnGeometry],
    dict_signals: Dict[str, Any],
    lang_scores: List[Dict[str, float]],
    content_h: int,
) -> Optional[List[PageRegion]]:
    """Classify the columns of a page that was detected as a dictionary.

    The article and headword columns named in ``dict_signals`` are assigned
    first. Every other column becomes ``column_ipa``, ``column_de`` or
    ``column_en`` based on its share of IPA-looking words and its language
    scores. Columns that match none of these get a positional fallback:
    the leftmost such column is ``column_en``, every later one ``column_de``.

    Args:
        geometries: Columns to classify; each must expose ``index``, ``x``,
            ``y``, ``width`` and ``words``.
        dict_signals: Result dict of the dictionary detection
            ('is_dictionary', 'confidence', 'article_col_index',
            'headword_col_index', 'signals').
        lang_scores: Per-column language scores, looked up by column index,
            with 'eng' and 'deu' entries.
        content_h: Height given to every produced region.

    Returns:
        Regions sorted by x position, or None when ``dict_signals`` does not
        mark the page as a dictionary.
    """
    if not dict_signals.get("is_dictionary"):
        return None

    def _make_region(geom, col_type: str, confidence: float) -> PageRegion:
        # All dictionary regions share position, height and method fields.
        return PageRegion(
            type=col_type,
            x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method="dictionary",
        )

    def _find(index):
        # First geometry carrying the requested column index, if any.
        return next((g for g in geometries if g.index == index), None)

    regions: List[PageRegion] = []
    assigned = set()
    article_idx = dict_signals.get("article_col_index")
    headword_idx = dict_signals.get("headword_col_index")

    # 1. Assign article column if detected
    if article_idx is not None:
        geom = _find(article_idx)
        if geom is not None:
            density = dict_signals["signals"].get("article_density", 0.5)
            regions.append(_make_region(geom, "column_article", round(density, 2)))
            assigned.add(geom.index)

    # 2. Assign headword column (unless it is the article column)
    if headword_idx is not None and headword_idx not in assigned:
        geom = _find(headword_idx)
        if geom is not None:
            regions.append(_make_region(
                geom, "column_headword", round(dict_signals["confidence"], 2)))
            assigned.add(geom.index)

    # 3. Assign remaining columns by language + content.
    # Walk them left to right so the positional fallback is well defined.
    remaining = sorted(
        (g for g in geometries if g.index not in assigned),
        key=lambda g: g.x,
    )
    # The previous implementation re-derived "leftmost unassigned" on every
    # iteration while removing processed columns from that set, so each
    # fallback column was its own leftmost and all of them became column_en.
    # Track explicitly whether the fallback EN slot has been handed out.
    fallback_en_taken = False
    for geom in remaining:
        scores = lang_scores[geom.index] if geom.index < len(lang_scores) else {}
        deu = scores.get("deu", 0)
        eng = scores.get("eng", 0)

        # Share of words containing IPA-typical characters (brackets, stress
        # marks, phonetic vowels).
        ipa_words = sum(
            1 for w in geom.words
            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
        )
        ipa_ratio = ipa_words / max(len(geom.words), 1)

        if ipa_ratio > 0.25:
            col_type = "column_ipa"
            conf = round(min(1.0, ipa_ratio), 2)
        elif deu > eng and deu > 0.05:
            col_type = "column_de"
            conf = round(deu, 2)
        elif eng > deu and eng > 0.05:
            col_type = "column_en"
            conf = round(eng, 2)
        else:
            # Positional fallback: leftmost undecided column = EN, next = DE
            col_type = "column_de" if fallback_en_taken else "column_en"
            fallback_en_taken = True
            conf = 0.4

        regions.append(_make_region(geom, col_type, conf))
        assigned.add(geom.index)

    regions.sort(key=lambda r: r.x)
    return regions
|
||||
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
CV-based Document Reconstruction Pipeline for Vocabulary Extraction.
|
||||
|
||||
Re-export facade — all logic lives in the sub-modules:
|
||||
|
||||
cv_vocab_types Dataklassen, Konstanten, IPA, Feature-Flags
|
||||
cv_preprocessing Bild-I/O, Orientierung, Deskew, Dewarp
|
||||
cv_layout Dokumenttyp, Spalten, Zeilen, Klassifikation
|
||||
cv_ocr_engines OCR-Engines, Vocab-Postprocessing, Text-Cleaning
|
||||
cv_cell_grid Cell-Grid (v2 + Legacy), Vocab-Konvertierung
|
||||
cv_review LLM/Spell Review, Pipeline-Orchestrierung
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
from cv_vocab_types import * # noqa: F401,F403
|
||||
from cv_preprocessing import * # noqa: F401,F403
|
||||
from cv_layout import * # noqa: F401,F403
|
||||
from cv_ocr_engines import * # noqa: F401,F403
|
||||
from cv_cell_grid import * # noqa: F401,F403
|
||||
from cv_box_detect import * # noqa: F401,F403
|
||||
from cv_review import * # noqa: F401,F403
|
||||
|
||||
# Private names used by consumers — not covered by wildcard re-exports.
|
||||
from cv_preprocessing import _apply_shear # noqa: F401
|
||||
from cv_layout import ( # noqa: F401
|
||||
_detect_header_footer_gaps,
|
||||
_detect_sub_columns,
|
||||
_split_broad_columns,
|
||||
)
|
||||
from cv_ocr_engines import ( # noqa: F401
|
||||
_fix_character_confusion,
|
||||
_fix_phonetic_brackets,
|
||||
)
|
||||
from cv_cell_grid import _cells_to_vocab_entries # noqa: F401
|
||||
from cv_words_first import build_grid_from_words # noqa: F401
|
||||
@@ -0,0 +1,437 @@
|
||||
"""
|
||||
CV Preprocessing Deskew — Rotation correction via Hough lines, word alignment, and iterative projection.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
CV2_AVAILABLE,
|
||||
TESSERACT_AVAILABLE,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Deskew via Hough Lines
|
||||
# =============================================================================
|
||||
|
||||
def deskew_image(img: np.ndarray) -> Tuple[np.ndarray, float]:
    """Straighten an image using the median tilt of detected Hough segments.

    Line segments are detected on an Otsu-binarised (inverted) copy of the
    image. Segments tilted less than 15 degrees vote for the rotation; the
    median vote is limited to +/-5 degrees and ignored below 0.1 degrees.

    Args:
        img: BGR image.

    Returns:
        Tuple of (corrected image, applied angle in degrees). The input
        image and 0.0 are returned when no correction is made.
    """
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    segments = cv2.HoughLinesP(
        ink, 1, np.pi / 180,
        threshold=100,
        minLineLength=img.shape[1] // 4,
        maxLineGap=20,
    )
    # Fewer than three segments is not enough evidence.
    if segments is None or len(segments) < 3:
        return img, 0.0

    tilts = [
        np.degrees(np.arctan2(seg[0][3] - seg[0][1], seg[0][2] - seg[0][0]))
        for seg in segments
    ]
    angles = [tilt for tilt in tilts if abs(tilt) < 15]
    if not angles:
        return img, 0.0

    median_angle = float(np.median(angles))

    # Limit the correction to 5 degrees in either direction.
    if abs(median_angle) > 5.0:
        median_angle = 5.0 * np.sign(median_angle)

    # Tiny angles are not worth resampling the image for.
    if abs(median_angle) < 0.1:
        return img, 0.0

    rows, cols = img.shape[:2]
    matrix = cv2.getRotationMatrix2D((cols // 2, rows // 2), median_angle, 1.0)
    corrected = cv2.warpAffine(
        img, matrix, (cols, rows),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )

    logger.info(f"Deskew: corrected {median_angle:.2f}\u00b0 rotation")
    return corrected, median_angle
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Deskew via Word Alignment
|
||||
# =============================================================================
|
||||
|
||||
def deskew_image_by_word_alignment(
    image_data: bytes,
    lang: str = "eng+deu",
    downscale_factor: float = 0.5,
) -> Tuple[bytes, float]:
    """Correct rotation by fitting a line through left-most word starts per text line.

    A quick Tesseract pass on a downscaled copy yields word boxes. For every
    text line the left-most word is taken, points far from the median left
    edge are dropped, and a straight line x = slope * y + b is fitted
    through the rest. The lean of that line determines the rotation.

    Args:
        image_data: Raw image bytes (PNG/JPEG).
        lang: Tesseract language string for the quick pass.
        downscale_factor: Shrink factor for the quick Tesseract pass (0.5 = 50%).

    Returns:
        Tuple of (rotated image as PNG bytes, applied rotation in degrees,
        positive = counter-clockwise as in OpenCV). The original bytes and
        0.0 are returned whenever no correction is applied.
    """
    if not CV2_AVAILABLE or not TESSERACT_AVAILABLE:
        return image_data, 0.0

    img = cv2.imdecode(np.frombuffer(image_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        logger.warning("deskew_by_word_alignment: could not decode image")
        return image_data, 0.0

    orig_h, orig_w = img.shape[:2]

    small_w = int(orig_w * downscale_factor)
    small_h = int(orig_h * downscale_factor)
    # A non-positive factor or a tiny image would ask cv2.resize for an
    # empty target and later divide by zero; treat it as "nothing to do".
    if small_w < 1 or small_h < 1:
        logger.warning(
            "deskew_by_word_alignment: downscale_factor=%s leaves no pixels, skipping",
            downscale_factor,
        )
        return image_data, 0.0
    small = cv2.resize(img, (small_w, small_h), interpolation=cv2.INTER_AREA)

    pil_small = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
    try:
        data = pytesseract.image_to_data(
            pil_small, lang=lang, config="--psm 6 --oem 3",
            output_type=pytesseract.Output.DICT,
        )
    except Exception as e:
        logger.warning(f"deskew_by_word_alignment: Tesseract failed: {e}")
        return image_data, 0.0

    # Group usable word indices by (block, paragraph, line).
    line_groups: Dict[tuple, list] = defaultdict(list)
    for i, raw_text in enumerate(data["text"]):
        text = (raw_text or "").strip()
        # int(float(...)) also accepts confidences delivered as "95.5".
        conf = int(float(data["conf"][i]))
        if not text or conf < 20:
            continue
        key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        line_groups[key].append(i)

    if len(line_groups) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(line_groups)} lines, skipping")
        return image_data, 0.0

    # One point per text line: left edge and vertical centre of its
    # left-most word, scaled back to full-resolution coordinates.
    scale = 1.0 / downscale_factor
    points = []
    for indices in line_groups.values():
        first = min(indices, key=lambda i: data["left"][i])
        lx = data["left"][first] * scale
        cy = (data["top"][first] + data["height"][first] / 2.0) * scale
        points.append((lx, cy))

    xs = np.array([p[0] for p in points])
    ys = np.array([p[1] for p in points])
    median_x = float(np.median(xs))
    tolerance = orig_w * 0.03

    # Keep only lines that start near the common left margin.
    mask = np.abs(xs - median_x) <= tolerance
    filtered_xs = xs[mask]
    filtered_ys = ys[mask]

    if len(filtered_xs) < 5:
        logger.info(f"deskew_by_word_alignment: only {len(filtered_xs)} aligned points after filter, skipping")
        return image_data, 0.0

    slope = np.polyfit(filtered_ys, filtered_xs, 1)[0]
    lean_deg = float(np.degrees(np.arctan(slope)))

    # A margin drifting right towards the bottom (slope > 0) means the page
    # is rotated counter-clockwise. OpenCV treats positive angles as
    # counter-clockwise, so the correcting rotation is the NEGATIVE lean;
    # rotating by +lean would double the skew instead of removing it.
    angle_deg = max(-5.0, min(5.0, -lean_deg))

    logger.info(f"deskew_by_word_alignment: detected {angle_deg:.2f}\u00b0 from {len(filtered_xs)} points "
                f"(total lines: {len(line_groups)})")

    if abs(angle_deg) < 0.05:
        return image_data, 0.0

    center = (orig_w // 2, orig_h // 2)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    rotated = cv2.warpAffine(img, M, (orig_w, orig_h),
                             flags=cv2.INTER_LINEAR,
                             borderMode=cv2.BORDER_REPLICATE)

    success, png_buf = cv2.imencode(".png", rotated)
    if not success:
        logger.warning("deskew_by_word_alignment: PNG encoding failed")
        return image_data, 0.0

    return png_buf.tobytes(), angle_deg
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Projection Gradient Scoring
|
||||
# =============================================================================
|
||||
|
||||
def _projection_gradient_score(profile: np.ndarray) -> float:
|
||||
"""Score a projection profile by the L2-norm of its first derivative."""
|
||||
diff = np.diff(profile)
|
||||
return float(np.sum(diff * diff))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Iterative Deskew (Vertical-Edge Projection)
|
||||
# =============================================================================
|
||||
|
||||
def deskew_image_iterative(
    img: np.ndarray,
    coarse_range: float = 5.0,
    coarse_step: float = 0.1,
    fine_range: float = 0.15,
    fine_step: float = 0.02,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Find the rotation that makes the vertical-edge projection sharpest.

    A Sobel-x edge map of the central part of the page is rotated over a
    coarse grid of angles and then over a fine grid around the coarse
    winner. Each candidate is scored by the gradient energy of its column
    sums; the best fine angle (limited to +/-5 degrees) is applied to the
    full image.

    Args:
        img: BGR image (full resolution).
        coarse_range: half-range in degrees for the coarse sweep.
        coarse_step: step size in degrees for the coarse sweep.
        fine_range: half-range around the coarse winner for the fine sweep.
        fine_step: step size in degrees for the fine sweep.

    Returns:
        (rotated_bgr, angle_degrees, debug_dict)
    """
    h, w = img.shape[:2]
    debug: Dict[str, Any] = {}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Only the central window (15-85% of height, 10-90% of width) is scored.
    window = gray[int(h * 0.15):int(h * 0.85), int(w * 0.10):int(w * 0.90)]

    magnitude = np.abs(cv2.Sobel(window, cv2.CV_64F, 1, 0, ksize=3))
    peak = magnitude.max()
    if not peak > 0:
        return img, 0.0, {"error": "no edges detected"}
    # Stretch the edge strengths to the 0..255 range of an 8-bit map.
    edges = (magnitude / peak * 255).astype(np.uint8)

    win_h, win_w = edges.shape[:2]
    pivot = (win_w // 2, win_h // 2)

    # Border strip removed from every rotated map before it is scored.
    margin_y = max(4, int(win_h * 0.03))
    margin_x = max(4, int(win_w * 0.03))

    def _score_angle(angle) -> tuple:
        # Rotate the edge map (unless the angle is effectively zero) and
        # score the column sums of its interior.
        if abs(angle) < 1e-6:
            turned = edges
        else:
            matrix = cv2.getRotationMatrix2D(pivot, angle, 1.0)
            turned = cv2.warpAffine(
                edges, matrix, (win_w, win_h),
                flags=cv2.INTER_NEAREST,
                borderMode=cv2.BORDER_REPLICATE,
            )
        interior = turned[margin_y:-margin_y, margin_x:-margin_x]
        column_sums = np.sum(interior, axis=0, dtype=np.float64)
        return float(angle), _projection_gradient_score(column_sums)

    # --- Coarse sweep ---
    coarse_grid = np.arange(-coarse_range, coarse_range + coarse_step * 0.5, coarse_step)
    coarse_results = [_score_angle(a) for a in coarse_grid]
    best_coarse_angle, best_coarse_score = max(coarse_results, key=lambda pair: pair[1])

    debug["coarse_best_angle"] = round(best_coarse_angle, 2)
    debug["coarse_best_score"] = round(best_coarse_score, 1)
    debug["coarse_scores"] = [(round(a, 2), round(s, 1)) for a, s in coarse_results]

    # --- Fine sweep around the coarse winner ---
    fine_grid = np.arange(
        best_coarse_angle - fine_range,
        best_coarse_angle + fine_range + fine_step * 0.5,
        fine_step,
    )
    fine_results = [_score_angle(a) for a in fine_grid]
    best_fine_angle, best_fine_score = max(fine_results, key=lambda pair: pair[1])

    debug["fine_best_angle"] = round(best_fine_angle, 2)
    debug["fine_best_score"] = round(best_fine_score, 1)
    debug["fine_scores"] = [(round(a, 2), round(s, 1)) for a, s in fine_results]

    # The applied angle never exceeds 5 degrees in either direction.
    final_angle = max(-5.0, min(5.0, best_fine_angle))

    logger.info(f"deskew_iterative: coarse={best_coarse_angle:.2f}\u00b0 fine={best_fine_angle:.2f}\u00b0 -> {final_angle:.2f}\u00b0")

    if abs(final_angle) < 0.05:
        return img, 0.0, debug

    matrix = cv2.getRotationMatrix2D((w // 2, h // 2), final_angle, 1.0)
    rotated = cv2.warpAffine(
        img, matrix, (w, h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )

    return rotated, final_angle, debug
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Text-Line Slope Measurement
|
||||
# =============================================================================
|
||||
|
||||
def _measure_textline_slope(img: np.ndarray) -> float:
    """Estimate the remaining text-line tilt from Tesseract word centres.

    Word centres are grouped per text line, a least-squares line is fitted
    through each sufficiently wide line, and the per-line angles are
    averaged after trimming the extremes.

    Args:
        img: BGR image.

    Returns:
        Mean text-line angle in degrees, or 0.0 when OpenCV/Tesseract are
        unavailable or fewer than three lines could be measured.
    """
    import math as _math

    if not TESSERACT_AVAILABLE or not CV2_AVAILABLE:
        return 0.0

    _, width = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(
        Image.fromarray(gray),
        output_type=pytesseract.Output.DICT,
        config="--psm 6",
    )

    # Collect word centres per (block, paragraph, line); skip words with
    # fewer than two characters or a confidence below 30.
    centres_by_line: Dict[tuple, list] = {}
    for i, raw in enumerate(data["text"]):
        word = (raw or "").strip()
        if len(word) < 2 or int(data["conf"][i]) < 30:
            continue
        line_key = (data["block_num"][i], data["par_num"][i], data["line_num"][i])
        centre = (
            data["left"][i] + data["width"][i] / 2.0,
            data["top"][i] + data["height"][i] / 2.0,
        )
        centres_by_line.setdefault(line_key, []).append(centre)

    angles: list = []
    for centres in centres_by_line.values():
        if len(centres) < 3:
            continue
        ordered = sorted(centres, key=lambda p: p[0])
        xs = np.array([p[0] for p in ordered], dtype=np.float64)
        ys = np.array([p[1] for p in ordered], dtype=np.float64)
        # Lines spanning less than 15% of the image width are skipped.
        if xs[-1] - xs[0] < width * 0.15:
            continue
        design = np.vstack([xs, np.ones_like(xs)]).T
        coeffs = np.linalg.lstsq(design, ys, rcond=None)[0]
        angles.append(_math.degrees(_math.atan(coeffs[0])))

    if len(angles) < 3:
        return 0.0

    # Trimmed mean: drop roughly 10% (at least one value) from each end.
    angles.sort()
    cut = max(1, len(angles) // 10)
    kept = angles[cut:-cut] if len(angles) > 2 * cut else angles
    if not kept:
        return 0.0

    return sum(kept) / len(kept)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Two-Pass Deskew
|
||||
# =============================================================================
|
||||
|
||||
def deskew_two_pass(
    img: np.ndarray,
    coarse_range: float = 5.0,
) -> Tuple[np.ndarray, float, Dict[str, Any]]:
    """Deskew with an iterative pass followed by two residual checks.

    Pass 1 runs ``deskew_image_iterative`` on a copy of the image.
    Pass 2 runs ``deskew_image_by_word_alignment`` on the pass-1 result and
    keeps its output only when the reported angle is at least 0.3 degrees.
    Pass 3 measures the remaining text-line slope and rotates once more
    when it is at least 0.3 degrees. Failures in pass 2 or 3 are logged and
    that pass is skipped.

    Args:
        img: BGR image.
        coarse_range: half-range in degrees handed to the iterative pass.

    Returns:
        (corrected_bgr, total_angle_degrees, debug_dict)
    """
    debug: Dict[str, Any] = {}

    # --- Pass 1: iterative projection ---
    corrected, angle1, dbg1 = deskew_image_iterative(
        img.copy(), coarse_range=coarse_range,
    )
    debug["pass1_angle"] = round(angle1, 3)
    debug["pass1_method"] = "iterative"
    debug["pass1_debug"] = dbg1

    # --- Pass 2: word-alignment residual check ---
    angle2 = 0.0
    try:
        encoded_ok, png = cv2.imencode(".png", corrected)
        if encoded_ok:
            aligned_bytes, angle2 = deskew_image_by_word_alignment(png.tobytes())
            if abs(angle2) < 0.3:
                logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 < 0.3\u00b0 -- skipped")
                angle2 = 0.0
            else:
                decoded = cv2.imdecode(
                    np.frombuffer(aligned_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
                if decoded is None:
                    # Undecodable result: keep the pass-1 image.
                    angle2 = 0.0
                else:
                    corrected = decoded
                    logger.info(f"deskew_two_pass: pass2 residual={angle2:.2f}\u00b0 applied "
                                f"(total={angle1 + angle2:.2f}\u00b0)")
    except Exception as e:
        logger.warning(f"deskew_two_pass: pass2 word-alignment failed: {e}")
        angle2 = 0.0

    # --- Pass 3: Tesseract text-line regression residual check ---
    angle3 = 0.0
    try:
        residual = _measure_textline_slope(corrected)
        debug["pass3_raw"] = round(residual, 3)
        if abs(residual) < 0.3:
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 < 0.3\u00b0 -- skipped", residual)
        else:
            rows, cols = corrected.shape[:2]
            matrix = cv2.getRotationMatrix2D((cols // 2, rows // 2), residual, 1.0)
            corrected = cv2.warpAffine(
                corrected, matrix, (cols, rows),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )
            angle3 = residual
            logger.info("deskew_two_pass: pass3 text-line residual=%.2f\u00b0 applied", residual)
    except Exception as e:
        logger.warning("deskew_two_pass: pass3 text-line check failed: %s", e)

    total_angle = angle1 + angle2 + angle3
    debug["pass2_angle"] = round(angle2, 3)
    debug["pass2_method"] = "word_alignment"
    debug["pass3_angle"] = round(angle3, 3)
    debug["pass3_method"] = "textline_regression"
    debug["total_angle"] = round(total_angle, 3)

    logger.info(
        "deskew_two_pass: pass1=%.2f\u00b0 + pass2=%.2f\u00b0 + pass3=%.2f\u00b0 = %.2f\u00b0",
        angle1, angle2, angle3, total_angle,
    )

    return corrected, total_angle, debug
|
||||
@@ -0,0 +1,474 @@
|
||||
"""
|
||||
CV Preprocessing Dewarp — Vertical shear detection and correction.
|
||||
|
||||
Provides four shear detection methods (vertical edge, projection variance,
|
||||
Hough lines, text-line drift), ensemble combination, quality gating,
|
||||
and the main dewarp_image() function.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
CV2_AVAILABLE,
|
||||
TESSERACT_AVAILABLE,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Shear Detection Methods
|
||||
# =============================================================================
|
||||
|
||||
def _detect_shear_angle(img: np.ndarray) -> Dict[str, Any]:
    """Detect vertical shear angle via strongest vertical edge tracking (Method A).

    The image is cut into 20 horizontal strips. In each strip the column
    with the strongest vertical-edge response inside the left 40% of the
    page is located. A straight line x = slope * y + b is fitted through
    those positions; its lean is reported as the shear angle.

    Args:
        img: BGR image.

    Returns:
        Dict with 'method' ("vertical_edge"), 'shear_degrees' and
        'confidence'. Angle and confidence stay 0.0 when too few strips
        yield a usable edge position.
    """
    h, w = img.shape[:2]
    result = {"method": "vertical_edge", "shear_degrees": 0.0, "confidence": 0.0}

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
    # Sobel magnitudes can exceed 255. Saturate before the 8-bit cast; a
    # plain astype(uint8) wraps large values around, turning the strongest
    # edges into weak ones ahead of the Otsu threshold.
    abs_sobel = np.clip(np.abs(sobel_x), 0, 255).astype(np.uint8)

    _, binary = cv2.threshold(abs_sobel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    num_strips = 20
    strip_h = h // num_strips
    search_w = int(w * 0.4)
    # Images too small to form strips or a search window carry no signal.
    if strip_h == 0 or search_w == 0:
        return result

    # Smoothing kernel width: about 1% of the page width, at least 3, odd.
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1

    edge_positions = []
    for i in range(num_strips):
        y_start = i * strip_h
        y_end = min((i + 1) * strip_h, h)

        # Column sums of the edge mask, restricted to the left search window.
        projection = np.sum(binary[y_start:y_end, :], axis=0).astype(np.float64)
        left_proj = projection[:search_w]
        if left_proj.max() == 0:
            continue

        smoothed = cv2.GaussianBlur(left_proj.reshape(1, -1), (kernel_size, 1), 0).flatten()
        x_pos = float(np.argmax(smoothed))
        y_center = (y_start + y_end) / 2.0
        edge_positions.append((y_center, x_pos))

    if len(edge_positions) < 8:
        return result

    ys = np.array([p[0] for p in edge_positions])
    xs = np.array([p[1] for p in edge_positions])

    # Drop strips whose edge lies more than two standard deviations from
    # the median position.
    median_x = np.median(xs)
    std_x = max(np.std(xs), 1.0)
    mask = np.abs(xs - median_x) < 2 * std_x
    ys = ys[mask]
    xs = xs[mask]

    if len(ys) < 6:
        return result

    straight_coeffs = np.polyfit(ys, xs, 1)
    slope = straight_coeffs[0]
    residuals = xs - np.polyval(straight_coeffs, ys)
    rmse = float(np.sqrt(np.mean(residuals ** 2)))

    shear_degrees = math.degrees(math.atan(slope))

    # Confidence rises with the number of supporting strips and falls with
    # the fit error, but the error term never drops below 0.5.
    confidence = min(1.0, len(ys) / 15.0) * max(0.5, 1.0 - rmse / 5.0)

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(float(confidence), 2)

    return result
|
||||
|
||||
|
||||
def _detect_shear_by_projection(img: np.ndarray) -> Dict[str, Any]:
    """Estimate shear by maximising row-projection variance (Method B).

    The page is binarised (Otsu, inverted), halved in size, and sheared by a
    set of trial angles. For every trial the per-row pixel sums are taken and
    their variance is recorded; the angle with the highest variance wins.
    A coarse sweep (-3.0 .. +3.0 degrees in 0.5 steps) is followed by a fine
    sweep (+/-0.5 degrees in 0.05 steps) around the coarse winner.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` ("projection"), ``shear_degrees`` (rounded to
        3 decimals) and ``confidence`` (0..1, rounded to 2 decimals).
    """
    result = {"method": "projection", "shear_degrees": 0.0, "confidence": 0.0}

    height, width = img.shape[:2]
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Work at half resolution to keep the sweep cheap.
    small = cv2.resize(binary, (width // 2, height // 2), interpolation=cv2.INTER_AREA)
    small_h, small_w = small.shape

    def _variance_at(angle_deg):
        """Row-projection variance of ``small`` sheared by ``angle_deg``."""
        if abs(angle_deg) < 0.001:
            sheared = small
        else:
            tangent = math.tan(math.radians(angle_deg))
            # Horizontal shift proportional to y, pivoting around mid-height.
            matrix = np.float32([[1, tangent, -small_h / 2.0 * tangent], [0, 1, 0]])
            sheared = cv2.warpAffine(small, matrix, (small_w, small_h),
                                     flags=cv2.INTER_NEAREST,
                                     borderMode=cv2.BORDER_CONSTANT)
        row_sums = np.sum(sheared, axis=1).astype(float)
        return float(np.var(row_sums))

    def _sweep(angles):
        return [(angle, _variance_at(angle)) for angle in angles]

    def _score(pair):
        return pair[1]

    coarse = _sweep([step * 0.5 for step in range(-6, 7)])
    coarse_angle = max(coarse, key=_score)[0]

    fine = _sweep([coarse_angle + step * 0.05 for step in range(-10, 11)])
    best_angle, best_variance = max(fine, key=_score)

    # Confidence: how far the winning variance stands out from the mean of
    # every variance measured in both sweeps.
    measured = coarse + fine
    mean_variance = sum(value for _, value in measured) / len(measured)
    confidence = 0.0
    if mean_variance > 0 and best_variance > mean_variance:
        confidence = min(1.0, (best_variance - mean_variance) / (mean_variance + 1.0) * 0.6)

    result["shear_degrees"] = round(best_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||||
|
||||
|
||||
def _detect_shear_by_hough(img: np.ndarray) -> Dict[str, Any]:
    """Estimate shear from near-horizontal Hough line segments (Method C).

    Canny edges are fed to a probabilistic Hough transform. Segments whose
    angle is within +/-5 degrees of horizontal are kept, and the
    length-weighted median of their angles is negated to give the shear.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` ("hough_lines"), ``shear_degrees`` and
        ``confidence``. Both stay 0.0 when fewer than three usable segments
        are found.
    """
    result = {"method": "hough_lines", "shear_degrees": 0.0, "confidence": 0.0}

    width = img.shape[1]
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(grey, 50, 150, apertureSize=3)

    segments = cv2.HoughLinesP(
        edge_map,
        rho=1,
        theta=np.pi / 360,
        threshold=int(width * 0.08),
        minLineLength=int(width * 0.15),
        maxLineGap=20,
    )
    if segments is None or len(segments) < 3:
        return result

    # (angle in degrees, segment length) for every near-horizontal segment.
    near_horizontal: List[Tuple[float, float]] = []
    for segment in segments:
        x1, y1, x2, y2 = segment[0]
        if x1 == x2:
            # Perfectly vertical segment: irrelevant for horizontal drift.
            continue
        angle = float(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
        if abs(angle) <= 5.0:
            near_horizontal.append(
                (angle, float(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)))
            )

    if len(near_horizontal) < 3:
        return result

    # Length-weighted median: sort by angle, then find where the cumulative
    # length crosses half of the total length.
    angles = np.array([angle for angle, _ in near_horizontal])
    lengths = np.array([length for _, length in near_horizontal])
    order = np.argsort(angles)
    ordered_angles = angles[order]
    cumulative = np.cumsum(lengths[order])
    half_idx = int(np.searchsorted(cumulative, cumulative[-1] / 2.0))
    median_angle = float(ordered_angles[min(half_idx, len(ordered_angles) - 1)])

    # Confidence is the share of segments within 1 degree of the median,
    # capped at 0.85.
    agreeing = sum(1 for angle, _ in near_horizontal if abs(angle - median_angle) < 1.0)
    confidence = min(1.0, agreeing / max(len(near_horizontal), 1)) * 0.85

    result["shear_degrees"] = round(-median_angle, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    return result
|
||||
|
||||
|
||||
def _detect_shear_by_text_lines(img: np.ndarray) -> Dict[str, Any]:
    """Estimate shear from the horizontal drift of left-aligned word columns (Method D).

    The image is halved, OCR'd with Tesseract (sparse-text mode), and the
    recognised words are clustered by their left x coordinate. For every
    cluster that spans at least 30% of the page height a straight line
    x = f(y) is fitted; the median slope across clusters is converted to a
    shear angle.

    Args:
        img: BGR image.

    Returns:
        Dict with ``method`` ("text_lines"), ``shear_degrees`` and
        ``confidence``. Both stay 0.0 when OCR fails or too few
        words/columns are available.
    """
    result = {"method": "text_lines", "shear_degrees": 0.0, "confidence": 0.0}

    h, w = img.shape[:2]
    scale = 0.5
    small = cv2.resize(img, (int(w * scale), int(h * scale)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    pil_img = Image.fromarray(gray)

    try:
        data = pytesseract.image_to_data(
            pil_img, lang='eng+deu', config='--psm 11 --oem 3',
            output_type=pytesseract.Output.DICT,
        )
    except Exception as exc:
        # Best effort: this method simply abstains, but leave a trace so a
        # broken Tesseract installation does not go unnoticed.
        logger.warning("text_lines: OCR failed, skipping this method: %s", exc)
        return result

    # Collect (left_x, centre_y, width) for every plausible word.
    words = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        try:
            # Confidence may arrive as an int or as a (possibly float-formatted) string.
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError):
            continue
        if not text or conf < 20 or len(text) < 2:
            continue
        left_x = float(data['left'][i])
        cy = data['top'][i] + data['height'][i] / 2.0
        word_w = float(data['width'][i])
        words.append((left_x, cy, word_w))

    if len(words) < 15:
        return result

    avg_w = sum(ww for _, _, ww in words) / len(words)
    x_tol = max(avg_w * 0.4, 8)

    # Greedy clustering along x: a word joins the current column while its
    # left edge stays within x_tol of the column's running x estimate.
    words_by_x = sorted(words, key=lambda word: word[0])
    columns: List[List[Tuple[float, float]]] = []
    cur_col: List[Tuple[float, float]] = [(words_by_x[0][0], words_by_x[0][1])]
    cur_x = words_by_x[0][0]

    for lx, cy, _ in words_by_x[1:]:
        if abs(lx - cur_x) <= x_tol:
            cur_col.append((lx, cy))
            # Exponential moving average lets the column follow slow drift.
            cur_x = cur_x * 0.8 + lx * 0.2
        else:
            if len(cur_col) >= 5:
                columns.append(cur_col)
            cur_col = [(lx, cy)]
            cur_x = lx
    if len(cur_col) >= 5:
        columns.append(cur_col)

    if len(columns) < 2:
        return result

    # Slope of left-x against y for every column that is tall enough.
    drifts = []
    for col in columns:
        ys = np.array([p[1] for p in col])
        xs = np.array([p[0] for p in col])
        y_range = ys.max() - ys.min()
        if y_range < h * scale * 0.3:
            continue
        coeffs = np.polyfit(ys, xs, 1)
        drifts.append(coeffs[0])

    if len(drifts) < 2:
        return result

    median_drift = float(np.median(drifts))
    shear_degrees = math.degrees(math.atan(median_drift))

    # Confidence blends how many columns contributed with how well they agree.
    drift_std = float(np.std(drifts))
    consistency = max(0.0, 1.0 - drift_std * 50)
    count_factor = min(1.0, len(drifts) / 4.0)
    confidence = count_factor * 0.5 + consistency * 0.5

    result["shear_degrees"] = round(shear_degrees, 3)
    result["confidence"] = round(max(0.0, min(1.0, confidence)), 2)
    logger.info("text_lines(v2): %d columns, %d drifts, median=%.4f, "
                "shear=%.3f\u00b0, conf=%.2f",
                len(columns), len(drifts), median_drift,
                shear_degrees, confidence)
    return result
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Quality Check and Shear Application
|
||||
# =============================================================================
|
||||
|
||||
def _dewarp_quality_check(original: np.ndarray, corrected: np.ndarray) -> bool:
    """Return True when the corrected image has a higher row-projection variance.

    Both images are binarised (Otsu, inverted), halved in size, and summed
    row by row; the variance of those row sums is compared.

    Args:
        original: BGR image before the shear correction.
        corrected: BGR image after the shear correction.

    Returns:
        True if the corrected image's variance is strictly greater.
    """
    def _row_profile_variance(image: np.ndarray) -> float:
        grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        binarised = cv2.threshold(
            grey, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )[1]
        rows, cols = binarised.shape[0], binarised.shape[1]
        half = cv2.resize(binarised, (cols // 2, rows // 2),
                          interpolation=cv2.INTER_AREA)
        return float(np.var(np.sum(half, axis=1).astype(float)))

    before = _row_profile_variance(original)
    after = _row_profile_variance(corrected)
    return after > before
|
||||
|
||||
|
||||
def _apply_shear(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Warp an image with a shear of ``shear_degrees``.

    Each row is shifted horizontally in proportion to its y coordinate; the
    translation term keeps the row at mid-height in place. Border pixels are
    replicated and the output has the same size as the input.

    Args:
        img: Image to warp.
        shear_degrees: Shear angle in degrees.

    Returns:
        The warped image.
    """
    rows, cols = img.shape[:2]
    tangent = math.tan(math.radians(shear_degrees))
    affine = np.float32([
        [1, tangent, -rows / 2.0 * tangent],
        [0, 1, 0],
    ])
    return cv2.warpAffine(
        img,
        affine,
        (cols, rows),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Ensemble Shear Combination
|
||||
# =============================================================================
|
||||
|
||||
def _ensemble_shear(detections: List[Dict[str, Any]]) -> Tuple[float, float, str]:
|
||||
"""Combine multiple shear detections into a single weighted estimate (v2)."""
|
||||
_MIN_CONF = 0.35
|
||||
_METHOD_WEIGHT_BOOST = {"text_lines": 1.5}
|
||||
|
||||
accepted = []
|
||||
for d in detections:
|
||||
if d["confidence"] < _MIN_CONF:
|
||||
continue
|
||||
boost = _METHOD_WEIGHT_BOOST.get(d["method"], 1.0)
|
||||
effective_conf = d["confidence"] * boost
|
||||
accepted.append((d["shear_degrees"], effective_conf, d["method"]))
|
||||
|
||||
if not accepted:
|
||||
return 0.0, 0.0, "none"
|
||||
|
||||
if len(accepted) == 1:
|
||||
deg, conf, method = accepted[0]
|
||||
return deg, min(conf, 1.0), method
|
||||
|
||||
total_w = sum(c for _, c, _ in accepted)
|
||||
w_mean = sum(d * c for d, c, _ in accepted) / total_w
|
||||
|
||||
filtered = [(d, c, m) for d, c, m in accepted if abs(d - w_mean) <= 1.0]
|
||||
if not filtered:
|
||||
filtered = accepted
|
||||
|
||||
total_w2 = sum(c for _, c, _ in filtered)
|
||||
final_deg = sum(d * c for d, c, _ in filtered) / total_w2
|
||||
|
||||
avg_conf = total_w2 / len(filtered)
|
||||
spread = max(d for d, _, _ in filtered) - min(d for d, _, _ in filtered)
|
||||
agreement_bonus = 0.15 if spread < 0.5 else 0.0
|
||||
ensemble_conf = min(1.0, avg_conf + agreement_bonus)
|
||||
|
||||
methods_str = "+".join(m for _, _, m in filtered)
|
||||
return round(final_deg, 3), round(min(ensemble_conf, 1.0), 2), methods_str
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Dewarp Function
|
||||
# =============================================================================
|
||||
|
||||
def dewarp_image(img: np.ndarray, use_ensemble: bool = True) -> Tuple[np.ndarray, Dict[str, Any]]:
    """Detect and correct shear on a deskewed page (v2 with quality gate).

    Detection methods:
      A. _detect_shear_angle()          -- vertical edge profile
      B. _detect_shear_by_projection()  -- horizontal text-line variance
      C. _detect_shear_by_hough()       -- Hough lines on table borders
      D. _detect_shear_by_text_lines()  -- text-line straightness

    The correction is skipped when the estimated shear is below 0.08 degrees
    or the confidence is below 0.4. Corrections of 0.5 degrees or more must
    additionally pass _dewarp_quality_check(), otherwise the original image
    is returned.

    Args:
        img: BGR image (already deskewed).
        use_ensemble: If False, only method A is run and its result is used
            directly.

    Returns:
        Tuple of (image, info). ``info`` has the keys ``method``,
        ``shear_degrees``, ``confidence`` and ``detections`` (one summary
        dict per method that ran). When nothing was corrected, ``method`` is
        "none" and the original image is returned.
    """
    def _uncorrected(found: List[Dict[str, Any]]) -> Tuple[np.ndarray, Dict[str, Any]]:
        """Result tuple for every path that leaves the image untouched."""
        return img, {
            "method": "none",
            "shear_degrees": 0.0,
            "confidence": 0.0,
            "detections": found,
        }

    if not CV2_AVAILABLE:
        return _uncorrected([])

    started = time.time()
    if use_ensemble:
        detections = [
            _detect_shear_angle(img),
            _detect_shear_by_projection(img),
            _detect_shear_by_hough(img),
            _detect_shear_by_text_lines(img),
        ]
        shear_deg, confidence, method = _ensemble_shear(detections)
    else:
        single = _detect_shear_angle(img)
        detections = [single]
        shear_deg = single["shear_degrees"]
        confidence = single["confidence"]
        method = single["method"]
    duration = time.time() - started

    # The log line always has four A-D slots; methods that did not run show 0.
    per_method = [(d["shear_degrees"], d["confidence"]) for d in detections]
    per_method.extend([(0.0, 0.0)] * (4 - len(per_method)))
    logger.info(
        "dewarp: ensemble shear=%.3f\u00b0 conf=%.2f method=%s (%.2fs) | "
        "A=%.3f/%.2f B=%.3f/%.2f C=%.3f/%.2f D=%.3f/%.2f",
        shear_deg, confidence, method, duration,
        *[value for pair in per_method for value in pair],
    )

    summaries = [
        {
            "method": d["method"],
            "shear_degrees": d["shear_degrees"],
            "confidence": d["confidence"],
        }
        for d in detections
    ]

    # Too small or too uncertain to be worth correcting.
    if abs(shear_deg) < 0.08 or confidence < 0.4:
        return _uncorrected(summaries)

    corrected = _apply_shear(img, -shear_deg)

    # Larger corrections must demonstrably improve the projection profile.
    if abs(shear_deg) >= 0.5 and not _dewarp_quality_check(img, corrected):
        logger.info("dewarp: quality gate REJECTED correction (%.3f\u00b0) -- "
                    "projection variance did not improve", shear_deg)
        return _uncorrected(summaries)

    return corrected, {
        "method": method,
        "shear_degrees": shear_deg,
        "confidence": confidence,
        "detections": summaries,
    }
|
||||
|
||||
|
||||
def dewarp_image_manual(img: np.ndarray, shear_degrees: float) -> np.ndarray:
    """Apply a shear correction using a caller-supplied angle.

    Args:
        img: Image to correct.
        shear_degrees: Shear angle in degrees; the negated angle is passed
            to _apply_shear().

    Returns:
        The corrected image, or ``img`` itself when the angle's magnitude is
        below 0.001 degrees.
    """
    if not abs(shear_degrees) < 0.001:
        return _apply_shear(img, -shear_degrees)
    return img
|
||||
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
Image I/O, orientation detection, deskew, and dewarp for the CV vocabulary pipeline.
|
||||
|
||||
Re-export facade -- all logic lives in the sub-modules:
|
||||
|
||||
cv_preprocessing_deskew Rotation correction (Hough, word-alignment, iterative, two-pass)
|
||||
cv_preprocessing_dewarp Vertical shear detection and correction (4 methods + ensemble)
|
||||
|
||||
This file contains the image I/O and orientation detection functions.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
CV2_AVAILABLE,
|
||||
TESSERACT_AVAILABLE,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Guarded imports
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
# Re-export all deskew functions
|
||||
from cv_preprocessing_deskew import ( # noqa: F401
|
||||
deskew_image,
|
||||
deskew_image_by_word_alignment,
|
||||
deskew_image_iterative,
|
||||
deskew_two_pass,
|
||||
_projection_gradient_score,
|
||||
_measure_textline_slope,
|
||||
)
|
||||
|
||||
# Re-export all dewarp functions
|
||||
from cv_preprocessing_dewarp import ( # noqa: F401
|
||||
_apply_shear,
|
||||
_detect_shear_angle,
|
||||
_detect_shear_by_hough,
|
||||
_detect_shear_by_projection,
|
||||
_detect_shear_by_text_lines,
|
||||
_dewarp_quality_check,
|
||||
_ensemble_shear,
|
||||
dewarp_image,
|
||||
dewarp_image_manual,
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Image I/O
|
||||
# =============================================================================
|
||||
|
||||
def render_pdf_high_res(pdf_data: bytes, page_number: int = 0, zoom: float = 3.0) -> np.ndarray:
    """Render a PDF page to a high-resolution numpy array (BGR).

    Args:
        pdf_data: Raw PDF bytes.
        page_number: 0-indexed page number.
        zoom: Zoom factor applied on both axes. PDF pages are rendered at
            72 DPI for zoom 1.0, so the default 3.0 yields 216 DPI.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If ``page_number`` is beyond the last page.
    """
    import fitz  # PyMuPDF

    pdf_doc = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        if page_number >= pdf_doc.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_doc.page_count} pages)")

        page = pdf_doc[page_number]
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # pix.n is the number of components per pixel.
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n == 4:  # RGBA
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
        elif pix.n == 3:  # RGB
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        else:  # Grayscale
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
        return img_bgr
    finally:
        # Close the document on every path, including the invalid-page and
        # render-failure paths that previously leaked it.
        pdf_doc.close()
|
||||
|
||||
|
||||
def render_image_high_res(image_data: bytes) -> np.ndarray:
    """Load an image (PNG/JPEG) into a numpy array (BGR).

    Args:
        image_data: Raw image bytes.

    Returns:
        numpy array in BGR format.

    Raises:
        ValueError: If ``image_data`` is empty or cannot be decoded.
    """
    # Reject empty input up front so callers get the same ValueError as for
    # any other undecodable payload instead of an OpenCV-level failure.
    if not image_data:
        raise ValueError("Could not decode image data")

    img_array = np.frombuffer(image_data, dtype=np.uint8)
    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
    if img_bgr is None:
        raise ValueError("Could not decode image data")
    return img_bgr
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Orientation Detection (0/90/180/270)
|
||||
# =============================================================================
|
||||
|
||||
def detect_and_fix_orientation(img_bgr: np.ndarray) -> Tuple[np.ndarray, int]:
    """Detect page orientation via Tesseract OSD and rotate if needed.

    The image is left untouched when pytesseract is unavailable, when OSD
    reports no rotation, when the orientation confidence is below 1.0, when
    the reported rotation is not 90/180/270, or when OSD fails.

    Returns:
        (corrected_image, rotation_degrees) -- rotation is 0, 90, 180, or 270.
    """
    if pytesseract is None:
        return img_bgr, 0

    try:
        grey = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        osd = pytesseract.image_to_osd(
            Image.fromarray(grey), output_type=pytesseract.Output.DICT
        )
        rotate = osd.get("rotate", 0)
        confidence = osd.get("orientation_conf", 0.0)

        logger.info(f"OSD: orientation={rotate}\u00b0 confidence={confidence:.1f}")

        if rotate == 0 or confidence < 1.0:
            return img_bgr, 0

        # Map the reported rotation onto the matching OpenCV rotate code.
        rotate_codes = {
            180: cv2.ROTATE_180,
            90: cv2.ROTATE_90_CLOCKWISE,
            270: cv2.ROTATE_90_COUNTERCLOCKWISE,
        }
        code = rotate_codes.get(rotate)
        if code is None:
            return img_bgr, 0

        corrected = cv2.rotate(img_bgr, code)
        logger.info(f"OSD: rotated {rotate}\u00b0 to fix orientation")
        return corrected, rotate

    except Exception as e:
        logger.warning(f"OSD orientation detection failed: {e}")
        return img_bgr, 0
|
||||
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
CV Review LLM — LLM-based OCR correction: prompt building, change detection, streaming.
|
||||
|
||||
Handles the LLM review path (REVIEW_ENGINE=llm) and shared utilities like
|
||||
_entry_needs_review, _is_spurious_change, _diff_batch, and JSON parsing.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Base URL of the Ollama server; the review functions POST to "<url>/api/chat".
_OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
# Model name used when a caller does not pass one explicitly.
OLLAMA_REVIEW_MODEL = os.getenv("OLLAMA_REVIEW_MODEL", "qwen3:0.6b")
# Default number of entries per LLM request in the streaming review.
# NOTE: int() raises at import time if the variable is set to a non-integer.
_REVIEW_BATCH_SIZE = int(os.getenv("OLLAMA_REVIEW_BATCH_SIZE", "20"))
logger.info("LLM review model: %s (batch=%d)", OLLAMA_REVIEW_MODEL, _REVIEW_BATCH_SIZE)

REVIEW_ENGINE = os.getenv("REVIEW_ENGINE", "spell")  # "spell" (default) | "llm"

# Regex: a [...] bracket group containing at least one IPA character
# (stress marks, length mark, and a set of IPA letters), e.g. in "dance [da:ns]"-style entries.
_HAS_PHONETIC_RE = re.compile(r'\[.*?[\u02c8\u02cc\u02d0\u0283\u0292\u03b8\u00f0\u014b\u0251\u0252\u0254\u0259\u025c\u026a\u028a\u028c\u00e6].*?\]')

# Regex: one of the digits 0, 1, 5, 6, 8 directly preceded or followed by a
# letter (A-Z, a-z, German umlauts, sharp s) -- OCR digit<->letter confusion.
_OCR_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])[01568]|[01568](?=[A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df])')
|
||||
|
||||
|
||||
def _entry_needs_review(entry: Dict) -> bool:
|
||||
"""Check if an entry should be sent for review.
|
||||
|
||||
Sends all non-empty entries that don't have IPA phonetic transcriptions.
|
||||
"""
|
||||
en = entry.get("english", "") or ""
|
||||
de = entry.get("german", "") or ""
|
||||
|
||||
if not en.strip() and not de.strip():
|
||||
return False
|
||||
if _HAS_PHONETIC_RE.search(en) or _HAS_PHONETIC_RE.search(de):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _build_llm_prompt(table_lines: List[Dict]) -> str:
|
||||
"""Build the LLM correction prompt for a batch of entries."""
|
||||
return f"""Du bist ein OCR-Zeichenkorrektur-Werkzeug fuer Vokabeltabellen (Englisch-Deutsch).
|
||||
|
||||
DEINE EINZIGE AUFGABE: Einzelne Zeichen korrigieren, die vom OCR-Scanner als Ziffer statt als Buchstabe erkannt wurden.
|
||||
|
||||
NUR diese Korrekturen sind erlaubt:
|
||||
- Ziffer 8 statt B: "8en" -> "Ben", "8uch" -> "Buch", "8all" -> "Ball"
|
||||
- Ziffer 0 statt O oder o: "L0ndon" -> "London", "0ld" -> "Old"
|
||||
- Ziffer 1 statt l oder I: "1ong" -> "long", "Ber1in" -> "Berlin"
|
||||
- Ziffer 5 statt S oder s: "5tadt" -> "Stadt", "5ee" -> "See"
|
||||
- Ziffer 6 statt G oder g: "6eld" -> "Geld"
|
||||
- Senkrechter Strich | statt I oder l: "| want" -> "I want", "|ong" -> "long", "he| p" -> "help"
|
||||
|
||||
ABSOLUT VERBOTEN -- aendere NIEMALS:
|
||||
- Woerter die korrekt geschrieben sind -- auch wenn du eine andere Schreibweise kennst
|
||||
- Uebersetzungen -- du uebersetzt NICHTS, weder EN->DE noch DE->EN
|
||||
- Korrekte englische Woerter (en-Spalte) -- auch wenn du eine Bedeutung kennst
|
||||
- Korrekte deutsche Woerter (de-Spalte) -- auch wenn du sie anders sagen wuerdest
|
||||
- Eigennamen: Ben, London, China, Africa, Shakespeare usw.
|
||||
- Abkuerzungen: sth., sb., etc., e.g., i.e., v.t., smb. usw.
|
||||
- Lautschrift in eckigen Klammern [...] -- diese NIEMALS beruehren
|
||||
- Beispielsaetze in der ex-Spalte -- NIEMALS aendern
|
||||
|
||||
Wenn ein Wort keinen Ziffer-Buchstaben-Fehler enthaelt: gib es UNVERAENDERT zurueck und setze "corrected": false.
|
||||
|
||||
Antworte NUR mit dem JSON-Array. Kein Text davor oder danach.
|
||||
Behalte die exakte Struktur (gleiche Anzahl Eintraege, gleiche Reihenfolge).
|
||||
|
||||
/no_think
|
||||
|
||||
Eingabe:
|
||||
{json.dumps(table_lines, ensure_ascii=False, indent=2)}"""
|
||||
|
||||
|
||||
def _is_spurious_change(old_val: str, new_val: str) -> bool:
|
||||
"""Detect LLM changes that are likely wrong and should be discarded.
|
||||
|
||||
Only digit<->letter substitutions (0->O, 1->l, 5->S, 6->G, 8->B) are
|
||||
legitimate OCR corrections. Everything else is rejected.
|
||||
"""
|
||||
if not old_val or not new_val:
|
||||
return False
|
||||
|
||||
if old_val.lower() == new_val.lower():
|
||||
return True
|
||||
|
||||
old_words = old_val.split()
|
||||
new_words = new_val.split()
|
||||
if abs(len(old_words) - len(new_words)) > 1:
|
||||
return True
|
||||
|
||||
_OCR_CHAR_MAP = {
|
||||
'0': set('oOgG'),
|
||||
'1': set('lLiI'),
|
||||
'5': set('sS'),
|
||||
'6': set('gG'),
|
||||
'8': set('bB'),
|
||||
'|': set('lLiI1'),
|
||||
'l': set('iI|1'),
|
||||
}
|
||||
has_valid_fix = False
|
||||
if len(old_val) == len(new_val):
|
||||
for oc, nc in zip(old_val, new_val):
|
||||
if oc != nc:
|
||||
if oc in _OCR_CHAR_MAP and nc in _OCR_CHAR_MAP[oc]:
|
||||
has_valid_fix = True
|
||||
elif nc in _OCR_CHAR_MAP and oc in _OCR_CHAR_MAP[nc]:
|
||||
has_valid_fix = True
|
||||
else:
|
||||
_OCR_SUSPICIOUS_RE = re.compile(r'[|01568]')
|
||||
if abs(len(old_val) - len(new_val)) <= 1 and _OCR_SUSPICIOUS_RE.search(old_val):
|
||||
has_valid_fix = True
|
||||
|
||||
if not has_valid_fix:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _diff_batch(originals: List[Dict], corrected: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
|
||||
"""Compare original entries with LLM-corrected ones, return (changes, corrected_entries)."""
|
||||
changes = []
|
||||
entries_out = []
|
||||
for i, orig in enumerate(originals):
|
||||
if i < len(corrected):
|
||||
c = corrected[i]
|
||||
entry = dict(orig)
|
||||
for field_name, key in [("english", "en"), ("german", "de"), ("example", "ex")]:
|
||||
new_val = c.get(key, "").strip()
|
||||
old_val = (orig.get(field_name, "") or "").strip()
|
||||
if new_val and new_val != old_val:
|
||||
if _is_spurious_change(old_val, new_val):
|
||||
continue
|
||||
changes.append({
|
||||
"row_index": orig.get("row_index", i),
|
||||
"field": field_name,
|
||||
"old": old_val,
|
||||
"new": new_val,
|
||||
})
|
||||
entry[field_name] = new_val
|
||||
entry["llm_corrected"] = True
|
||||
entries_out.append(entry)
|
||||
else:
|
||||
entries_out.append(dict(orig))
|
||||
return changes, entries_out
|
||||
|
||||
|
||||
def _sanitize_for_json(text: str) -> str:
|
||||
"""Remove or escape control characters that break JSON parsing."""
|
||||
return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' ', text)
|
||||
|
||||
|
||||
def _parse_llm_json_array(text: str) -> List[Dict]:
    """Extract the JSON array from an LLM response.

    ``<think>...</think>`` sections and markdown code fences are removed and
    control characters are blanked out; then the span from the first ``[``
    to the last ``]`` is parsed as JSON.

    Args:
        text: Raw LLM response text.

    Returns:
        The parsed array, or an empty list (with a warning logged) when no
        array is found or parsing fails.
    """
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    for fence_pattern in (r'```json\s*', r'```\s*'):
        cleaned = re.sub(fence_pattern, '', cleaned)
    cleaned = _sanitize_for_json(cleaned)

    found = re.search(r'\[.*\]', cleaned, re.DOTALL)
    if found is None:
        logger.warning("LLM review: no JSON array found in response (%.200s)", cleaned[:200])
        return []

    candidate = found.group()
    try:
        return json.loads(candidate)
    except (ValueError, json.JSONDecodeError) as e:
        logger.warning("LLM review: JSON parse failed: %s | raw snippet: %.200s", e, candidate[:200])
        return []
|
||||
|
||||
|
||||
async def llm_review_entries(
    entries: List[Dict],
    model: str = None,
) -> Dict:
    """Correct OCR errors in vocabulary entries.

    With REVIEW_ENGINE="spell" and the spell checker available, the work is
    delegated to spell_review_entries_sync(). Otherwise all reviewable
    entries are sent to the Ollama chat endpoint in a single request and the
    response is diffed against the originals.

    Args:
        entries: Vocabulary entries.
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``,
        ``skipped_count``, ``model_used`` and ``duration_ms`` (LLM path).

    Raises:
        httpx.HTTPStatusError: If the Ollama endpoint answers with an error
            status (via ``raise_for_status``).
    """
    from cv_review_spell import spell_review_entries_sync, _SPELL_AVAILABLE

    if REVIEW_ENGINE == "spell":
        if _SPELL_AVAILABLE:
            return spell_review_entries_sync(entries)
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    model = model or OLLAMA_REVIEW_MODEL

    # Remember each reviewable entry's position in the full list.
    reviewable = [
        (position, item)
        for position, item in enumerate(entries)
        if _entry_needs_review(item)
    ]
    skipped = len(entries) - len(reviewable)

    if not reviewable:
        return {
            "entries_original": entries,
            "entries_corrected": [dict(e) for e in entries],
            "changes": [],
            "skipped_count": len(entries),
            "model_used": model,
            "duration_ms": 0,
        }

    review_entries = [item for _, item in reviewable]
    table_lines = [
        {
            "row": item.get("row_index", 0),
            "en": item.get("english", ""),
            "de": item.get("german", ""),
            "ex": item.get("example", ""),
        }
        for item in review_entries
    ]

    logger.info("LLM review: sending %d/%d entries to %s (skipped %d without digit-pattern)",
                len(review_entries), len(entries), model, skipped)

    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": _build_llm_prompt(table_lines)}],
        "stream": False,
        "think": False,
        "options": {"temperature": 0.1, "num_predict": 8192},
    }

    started = time.time()
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(f"{_OLLAMA_URL}/api/chat", json=request_body)
        resp.raise_for_status()
        content = resp.json().get("message", {}).get("content", "")
    duration_ms = int((time.time() - started) * 1000)

    logger.info("LLM review: response in %dms, raw length=%d chars", duration_ms, len(content))

    corrected = _parse_llm_json_array(content)
    changes, corrected_entries = _diff_batch(review_entries, corrected)

    # Write the reviewed entries back to their original positions.
    all_corrected = [dict(e) for e in entries]
    for (orig_idx, _), fixed in zip(reviewable, corrected_entries):
        all_corrected[orig_idx] = fixed

    return {
        "entries_original": entries,
        "entries_corrected": all_corrected,
        "changes": changes,
        "skipped_count": skipped,
        "model_used": model,
        "duration_ms": duration_ms,
    }
|
||||
|
||||
|
||||
async def llm_review_entries_streaming(
    entries: List[Dict],
    model: str = None,
    batch_size: int = _REVIEW_BATCH_SIZE,
):
    """Async generator: yield SSE events. Uses spell-checker or LLM depending on REVIEW_ENGINE.

    Phase 0 (always): Run _fix_character_confusion on ``entries`` (which is
    modified in place) and emit any resulting changes as a "batch" event.

    LLM path events, in order: an optional "batch" event with the phase-0
    changes, one "meta" event, one "batch" event per LLM request, and a
    final "complete" event carrying all LLM changes and corrected entries.

    Args:
        entries: Vocabulary entries.
        model: Ollama model name; defaults to OLLAMA_REVIEW_MODEL.
        batch_size: Entries per LLM request. Values below 1 are clamped to 1
            so a misconfigured batch size cannot break or silently skip the
            review loop.

    Yields:
        Event dicts with a ``type`` key ("meta", "batch" or "complete").

    Raises:
        httpx.HTTPStatusError: If the Ollama endpoint answers with an error
            status (via ``raise_for_status``).
    """
    from cv_ocr_engines import _fix_character_confusion
    from cv_review_spell import spell_review_entries_streaming, _SPELL_AVAILABLE

    # range() rejects a step of 0 and yields nothing for a negative step.
    batch_size = max(1, batch_size)

    # Phase 0: snapshot the text fields, fix in place, then diff.
    _CONF_FIELDS = ('english', 'german', 'example')
    originals = [{f: e.get(f, '') for f in _CONF_FIELDS} for e in entries]
    _fix_character_confusion(entries)
    char_changes = [
        {'row_index': i, 'field': f, 'old': originals[i][f], 'new': entries[i].get(f, '')}
        for i in range(len(entries))
        for f in _CONF_FIELDS
        if originals[i][f] != entries[i].get(f, '')
    ]

    if REVIEW_ENGINE == "spell" and _SPELL_AVAILABLE:
        _meta_sent = False
        async for event in spell_review_entries_streaming(entries, batch_size):
            yield event
            # Inject the phase-0 changes once, right after the first meta event.
            if not _meta_sent and event.get('type') == 'meta' and char_changes:
                _meta_sent = True
                yield {
                    'type': 'batch',
                    'changes': char_changes,
                    'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
                    'progress': {'current': 0, 'total': len(entries)},
                }
        return

    if REVIEW_ENGINE == "spell" and not _SPELL_AVAILABLE:
        logger.warning("REVIEW_ENGINE=spell but pyspellchecker not installed, using LLM")

    # LLM path
    if char_changes:
        yield {
            'type': 'batch',
            'changes': char_changes,
            'entries_reviewed': sorted({c['row_index'] for c in char_changes}),
            'progress': {'current': 0, 'total': len(entries)},
        }

    model = model or OLLAMA_REVIEW_MODEL

    reviewable = []
    skipped_indices = []
    for i, e in enumerate(entries):
        if _entry_needs_review(e):
            reviewable.append((i, e))
        else:
            skipped_indices.append(i)

    total_to_review = len(reviewable)

    yield {
        "type": "meta",
        "total_entries": len(entries),
        "to_review": total_to_review,
        "skipped": len(skipped_indices),
        "model": model,
        "batch_size": batch_size,
    }

    all_changes = []
    all_corrected = [dict(e) for e in entries]
    total_duration_ms = 0
    reviewed_count = 0

    for batch_start in range(0, total_to_review, batch_size):
        batch_items = reviewable[batch_start:batch_start + batch_size]
        batch_entries = [e for _, e in batch_items]

        table_lines = [
            {"row": e.get("row_index", 0), "en": e.get("english", ""), "de": e.get("german", ""), "ex": e.get("example", "")}
            for e in batch_entries
        ]

        prompt = _build_llm_prompt(table_lines)

        logger.info("LLM review streaming: batch %d -- sending %d entries to %s",
                    batch_start // batch_size, len(batch_entries), model)

        t0 = time.time()
        async with httpx.AsyncClient(timeout=300.0) as client:
            resp = await client.post(
                f"{_OLLAMA_URL}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False,
                    "think": False,
                    "options": {"temperature": 0.1, "num_predict": 8192},
                },
            )
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
        batch_ms = int((time.time() - t0) * 1000)
        total_duration_ms += batch_ms

        corrected = _parse_llm_json_array(content)
        batch_changes, batch_corrected = _diff_batch(batch_entries, corrected)

        # Write reviewed entries back to their positions in the full list.
        for batch_idx, (orig_idx, _) in enumerate(batch_items):
            if batch_idx < len(batch_corrected):
                all_corrected[orig_idx] = batch_corrected[batch_idx]

        all_changes.extend(batch_changes)
        reviewed_count += len(batch_items)

        yield {
            "type": "batch",
            "batch_index": batch_start // batch_size,
            "entries_reviewed": [e.get("row_index", 0) for _, e in batch_items],
            "changes": batch_changes,
            "duration_ms": batch_ms,
            "progress": {"current": reviewed_count, "total": total_to_review},
        }

    yield {
        "type": "complete",
        "changes": all_changes,
        "model_used": model,
        "duration_ms": total_duration_ms,
        "total_entries": len(entries),
        "reviewed": total_to_review,
        "skipped": len(skipped_indices),
        "corrections_found": len(all_changes),
        "entries_corrected": all_corrected,
    }
|
||||
@@ -0,0 +1,430 @@
|
||||
"""
|
||||
CV Review Pipeline — Multi-pass OCR, line alignment, LLM post-correction, and orchestration.
|
||||
|
||||
Stages 6-8 of the CV vocabulary pipeline plus the main orchestrator.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import (
|
||||
CV_PIPELINE_AVAILABLE,
|
||||
PageRegion,
|
||||
PipelineResult,
|
||||
VocabRow,
|
||||
)
|
||||
from cv_preprocessing import (
|
||||
deskew_image,
|
||||
dewarp_image,
|
||||
render_image_high_res,
|
||||
render_pdf_high_res,
|
||||
)
|
||||
from cv_layout import (
|
||||
analyze_layout,
|
||||
create_layout_image,
|
||||
create_ocr_image,
|
||||
)
|
||||
from cv_ocr_engines import (
|
||||
_group_words_into_lines,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 6: Multi-Pass OCR
|
||||
# =============================================================================
|
||||
|
||||
def ocr_region(ocr_img: np.ndarray, region: PageRegion, lang: str,
               psm: int, fallback_psm: Optional[int] = None,
               min_confidence: float = 40.0) -> List[Dict[str, Any]]:
    """Run Tesseract OCR on one page region with the given PSM.

    Args:
        ocr_img: Binarized full-page image; the region is cropped out of it.
        region: Region to crop and OCR (x, y, width, height and type are read).
        lang: Tesseract language string.
        psm: Page Segmentation Mode used for the first pass.
        fallback_psm: If given and the average word confidence of the first
            pass is below ``min_confidence``, the region is re-read line by
            line with this PSM and that result replaces the first pass.
        min_confidence: Average-confidence threshold that triggers the fallback.

    Returns:
        List of word dicts with keys ``text``, ``left``, ``top``, ``width``,
        ``height``, ``conf`` and ``region_type``. ``left``/``top`` are shifted
        by the region origin, i.e. expressed in full-page coordinates.
        Empty when the crop is empty or Tesseract raises.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    pil_img = Image.fromarray(crop)

    config = f'--psm {psm} --oem 3'
    try:
        data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                         output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"Tesseract failed for region {region.type}: {e}")
        return []

    words = []
    # .get(): the result dict can be empty when Tesseract emits no data rows.
    for i, raw_text in enumerate(data.get('text', [])):
        text = raw_text.strip()
        try:
            # Confidence may arrive as int, float or a float-formatted string
            # (e.g. "96.3"); int() alone would raise on the latter.
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError):
            continue
        if not text or conf < 10:
            continue
        words.append({
            'text': text,
            'left': data['left'][i] + region.x,
            'top': data['top'][i] + region.y,
            'width': data['width'][i],
            'height': data['height'][i],
            'conf': conf,
            'region_type': region.type,
        })

    if words and fallback_psm is not None:
        avg_conf = sum(w['conf'] for w in words) / len(words)
        if avg_conf < min_confidence:
            logger.info(f"Region {region.type}: avg confidence {avg_conf:.0f}% < {min_confidence}%, "
                        f"trying fallback PSM {fallback_psm}")
            words = _ocr_region_line_by_line(ocr_img, region, lang, fallback_psm)

    return words
|
||||
|
||||
|
||||
def _ocr_region_line_by_line(ocr_img: np.ndarray, region: PageRegion,
                             lang: str, psm: int) -> List[Dict[str, Any]]:
    """OCR a region line by line (fallback for low-confidence regions).

    Text lines are located with a horizontal projection profile of the
    inverted crop: consecutive rows whose summed intensity exceeds 5% of the
    profile maximum form one line; runs of 5 rows or fewer are discarded.
    Each line (padded by 3 rows) is then passed to Tesseract separately.

    Args:
        ocr_img: Full-page image the region is cropped from.
        region: Region to read (x, y, width, height and type are used).
        lang: Tesseract language string.
        psm: Page Segmentation Mode applied to every line crop.

    Returns:
        Word dicts in the same format as ``ocr_region``; ``left``/``top`` are
        shifted back into full-page coordinates. Lines on which Tesseract
        raises are skipped.
    """
    crop = ocr_img[region.y:region.y + region.height,
                   region.x:region.x + region.width]

    if crop.size == 0:
        return []

    # Invert so that (dark) text contributes high values to the row sums.
    inv = cv2.bitwise_not(crop)
    h_proj = np.sum(inv, axis=1)
    threshold = np.max(h_proj) * 0.05 if np.max(h_proj) > 0 else 0

    lines = []
    in_text = False
    line_start = 0
    for y in range(len(h_proj)):
        if h_proj[y] > threshold and not in_text:
            line_start = y
            in_text = True
        elif h_proj[y] <= threshold and in_text:
            if y - line_start > 5:
                lines.append((line_start, y))
            in_text = False
    # A line that is still open at the bottom edge of the crop.
    if in_text and len(h_proj) - line_start > 5:
        lines.append((line_start, len(h_proj)))

    all_words = []
    config = f'--psm {psm} --oem 3'
    pad = 3

    for line_y_start, line_y_end in lines:
        y1 = max(0, line_y_start - pad)
        y2 = min(crop.shape[0], line_y_end + pad)
        line_crop = crop[y1:y2, :]

        if line_crop.size == 0:
            continue

        pil_img = Image.fromarray(line_crop)
        try:
            data = pytesseract.image_to_data(pil_img, lang=lang, config=config,
                                             output_type=pytesseract.Output.DICT)
        except Exception as e:
            # Best effort: one unreadable line must not abort the region,
            # but leave a trace instead of failing silently.
            logger.debug("Tesseract failed for a line in region %s: %s", region.type, e)
            continue

        # .get(): the result dict can be empty when Tesseract emits no data rows.
        for i, raw_text in enumerate(data.get('text', [])):
            text = raw_text.strip()
            try:
                # Confidence may be an int, a float or a float-formatted string.
                conf = int(float(data['conf'][i]))
            except (TypeError, ValueError):
                continue
            if not text or conf < 10:
                continue
            all_words.append({
                'text': text,
                'left': data['left'][i] + region.x,
                # y1 is the line crop's offset inside the region crop.
                'top': data['top'][i] + region.y + y1,
                'width': data['width'][i],
                'height': data['height'][i],
                'conf': conf,
                'region_type': region.type,
            })

    return all_words
|
||||
|
||||
|
||||
def run_multi_pass_ocr(ocr_img: np.ndarray,
                       regions: List[PageRegion],
                       lang: str = "eng+deu") -> Dict[str, List[Dict]]:
    """Run OCR on each detected region with per-type settings.

    Header, footer and margin regions are skipped. ``column_en`` is read with
    language ``eng`` and ``column_de`` with ``deu`` (both PSM 4);
    ``column_example`` uses ``lang`` with PSM 6 and a line-by-line PSM 7
    fallback; every other region type uses ``lang`` with PSM 6.

    Args:
        ocr_img: Full-page image handed to ``ocr_region``.
        regions: Detected page regions.
        lang: Tesseract language string for regions without a fixed language.

    Returns:
        Mapping of region type to the list of word dicts found in regions of
        that type. Words of several regions sharing one type are concatenated
        in region order.
    """
    results: Dict[str, List[Dict]] = {}

    _ocr_skip = {'header', 'footer', 'margin_top', 'margin_bottom', 'margin_left', 'margin_right'}
    for region in regions:
        if region.type in _ocr_skip:
            continue

        if region.type == 'column_en':
            words = ocr_region(ocr_img, region, lang='eng', psm=4)
        elif region.type == 'column_de':
            words = ocr_region(ocr_img, region, lang='deu', psm=4)
        elif region.type == 'column_example':
            words = ocr_region(ocr_img, region, lang=lang, psm=6,
                               fallback_psm=7, min_confidence=40.0)
        else:
            words = ocr_region(ocr_img, region, lang=lang, psm=6)

        # Accumulate rather than assign: a plain assignment would silently
        # discard the words of an earlier region with the same type.
        results.setdefault(region.type, []).extend(words)
        logger.info(f"OCR {region.type}: {len(words)} words")

    return results
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 7: Line Alignment -> Vocabulary Entries
|
||||
# =============================================================================
|
||||
|
||||
def match_lines_to_vocab(ocr_results: Dict[str, List[Dict]],
                         regions: List[PageRegion],
                         y_tolerance_px: int = 25) -> List[VocabRow]:
    """Align OCR words of the EN / DE / example columns into vocabulary rows.

    Every English line with at least two characters of text becomes one row.
    The German and example lines whose vertical centre is closest to the
    English line (and closer than ``y_tolerance_px``) are attached to it.
    Example lines left unmatched are treated as wrapped continuations and
    appended to the nearest row above them within three tolerances.

    Args:
        ocr_results: Region type -> word dicts (``text``, ``top``, ``height``,
            ``conf`` are read).
        regions: Not referenced by this function.
        y_tolerance_px: Vertical tolerance for grouping and matching.

    Returns:
        Rows sorted by ``y_position``; empty if neither ``column_en`` nor
        ``column_de`` is present in ``ocr_results``.
    """
    if not ('column_en' in ocr_results or 'column_de' in ocr_results):
        logger.info("match_lines_to_vocab: no column_en/column_de in OCR results, returning empty")
        return []

    en_lines = _group_words_into_lines(ocr_results.get('column_en', []), y_tolerance_px)
    de_lines = _group_words_into_lines(ocr_results.get('column_de', []), y_tolerance_px)
    ex_lines = _group_words_into_lines(ocr_results.get('column_example', []), y_tolerance_px)

    def _center_y(words: List[Dict]) -> float:
        # Mean of the vertical midpoints of all words in the line.
        return sum(w['top'] + w['height'] / 2 for w in words) / len(words)

    def _joined(words: List[Dict]) -> str:
        return ' '.join(w['text'] for w in words)

    def _mean_conf(words: List[Dict]) -> float:
        return sum(w['conf'] for w in words) / len(words) if words else 0

    def _nearest(candidates: List[List[Dict]], target_y: float):
        # (text, confidence) of the closest line within tolerance; the first
        # line wins on equal distance. ("", 0.0) when nothing qualifies.
        winner = -1
        winner_gap = float('inf')
        for pos, cand in enumerate(candidates):
            gap = abs(_center_y(cand) - target_y)
            if gap < y_tolerance_px and gap < winner_gap:
                winner, winner_gap = pos, gap
        if winner < 0:
            return "", 0.0
        return _joined(candidates[winner]), _mean_conf(candidates[winner])

    vocab_rows: List[VocabRow] = []

    for english_line in en_lines:
        row_y = _center_y(english_line)
        english = _joined(english_line)
        english_conf = _mean_conf(english_line)

        if len(english.strip()) < 2:
            continue

        german, german_conf = _nearest(de_lines, row_y)
        example, example_conf = _nearest(ex_lines, row_y)

        # Only columns that actually contributed a confidence are averaged.
        confidences = [english_conf]
        if german_conf > 0:
            confidences.append(german_conf)
        if example_conf > 0:
            confidences.append(example_conf)

        vocab_rows.append(VocabRow(
            english=english.strip(),
            german=german.strip(),
            example=example.strip(),
            confidence=sum(confidences) / len(confidences),
            y_position=int(row_y),
        ))

    # Wrapped example text: lines not sitting next to a row that already has
    # an example are appended to the closest row above them.
    rows_with_example = {row.y_position for row in vocab_rows if row.example}

    for example_line in ex_lines:
        line_y = _center_y(example_line)
        if any(abs(line_y - y) < y_tolerance_px for y in rows_with_example):
            continue

        target = None
        target_gap = float('inf')
        for row in vocab_rows:
            gap = line_y - row.y_position
            if 0 < gap < y_tolerance_px * 3 and gap < target_gap:
                target_gap = gap
                target = row

        if not target:
            continue
        extra = _joined(example_line).strip()
        if extra:
            target.example = (target.example + " " + extra).strip()

    vocab_rows.sort(key=lambda r: r.y_position)

    return vocab_rows
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Stage 8: Optional LLM Post-Correction
|
||||
# =============================================================================
|
||||
|
||||
async def llm_post_correct(img: np.ndarray, vocab_rows: List[VocabRow],
                           confidence_threshold: float = 50.0,
                           enabled: bool = False) -> List[VocabRow]:
    """Placeholder for LLM-based correction of low-confidence rows.

    No correction is implemented yet: the rows are always returned unchanged.
    When ``enabled`` is true an info message records that the step was skipped.

    Args:
        img: Page image (currently unused).
        vocab_rows: Rows produced by the line-alignment stage.
        confidence_threshold: Intended confidence cut-off (currently unused).
        enabled: Whether the caller requested the correction step.

    Returns:
        ``vocab_rows``, unmodified.
    """
    if enabled:
        # Plain string: the message has no placeholders, so no f-string.
        logger.info("LLM post-correction skipped (not yet implemented)")
    return vocab_rows
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Orchestrator
|
||||
# =============================================================================
|
||||
|
||||
async def run_cv_pipeline(
    pdf_data: Optional[bytes] = None,
    image_data: Optional[bytes] = None,
    page_number: int = 0,
    zoom: float = 3.0,
    enable_dewarp: bool = True,
    enable_llm_correction: bool = False,
    lang: str = "eng+deu",
) -> PipelineResult:
    """Run the complete CV document reconstruction pipeline.

    Stages: render -> deskew -> optional dewarp -> OCR/layout image
    preparation -> layout analysis -> multi-pass OCR -> line alignment ->
    optional LLM correction. The wall-clock time of every executed stage is
    stored in ``result.stages`` (seconds, two decimals).

    Args:
        pdf_data: PDF bytes; takes precedence over ``image_data``.
        image_data: Image bytes, used when no PDF is given.
        page_number: Page passed to the PDF renderer.
        zoom: Zoom factor passed to the PDF renderer.
        enable_dewarp: Run the dewarp stage.
        enable_llm_correction: Run the LLM correction stage.
        lang: Tesseract language string for the OCR stage.

    Returns:
        A ``PipelineResult``. Exceptions raised inside the stages are caught
        and reported through ``result.error`` rather than propagated.
    """
    if not CV_PIPELINE_AVAILABLE:
        return PipelineResult(error="CV pipeline not available (OpenCV or Tesseract missing)")

    result = PipelineResult()
    pipeline_started = time.time()

    def _since(started: float) -> float:
        # Seconds elapsed since ``started``, rounded like every stage timing.
        return round(time.time() - started, 2)

    try:
        # Stage 1: Render
        stage_started = time.time()
        if pdf_data:
            img = render_pdf_high_res(pdf_data, page_number, zoom)
        elif image_data:
            img = render_image_high_res(image_data)
        else:
            return PipelineResult(error="No input data (pdf_data or image_data required)")
        result.stages['render'] = _since(stage_started)
        result.image_width = img.shape[1]
        result.image_height = img.shape[0]
        logger.info(f"Stage 1 (render): {img.shape[1]}x{img.shape[0]} in {result.stages['render']}s")

        # Stage 2: Deskew
        stage_started = time.time()
        img, angle = deskew_image(img)
        result.stages['deskew'] = _since(stage_started)
        logger.info(f"Stage 2 (deskew): {angle:.2f}\u00b0 in {result.stages['deskew']}s")

        # Stage 3: Dewarp (optional)
        if enable_dewarp:
            stage_started = time.time()
            img, _dewarp_info = dewarp_image(img)
            result.stages['dewarp'] = _since(stage_started)

        # Stage 4: Dual image preparation (one image for OCR, one for layout)
        stage_started = time.time()
        ocr_img = create_ocr_image(img)
        layout_img = create_layout_image(img)
        result.stages['image_prep'] = _since(stage_started)

        # Stage 5: Layout analysis
        stage_started = time.time()
        regions = analyze_layout(layout_img, ocr_img)
        result.stages['layout'] = _since(stage_started)
        result.columns_detected = sum(1 for r in regions if r.type.startswith('column'))
        logger.info(f"Stage 5 (layout): {result.columns_detected} columns in {result.stages['layout']}s")

        # Stage 6: Multi-pass OCR
        stage_started = time.time()
        ocr_results = run_multi_pass_ocr(ocr_img, regions, lang)
        result.stages['ocr'] = _since(stage_started)
        total_words = sum(len(found) for found in ocr_results.values())
        result.word_count = total_words
        logger.info(f"Stage 6 (OCR): {total_words} words in {result.stages['ocr']}s")

        # Stage 7: Line alignment
        stage_started = time.time()
        vocab_rows = match_lines_to_vocab(ocr_results, regions)
        result.stages['alignment'] = _since(stage_started)

        # Stage 8: Optional LLM correction
        # NOTE(review): llm_post_correct is called without enabled=True, so
        # with its default it returns the rows straight away -- confirm intent.
        if enable_llm_correction:
            stage_started = time.time()
            vocab_rows = await llm_post_correct(img, vocab_rows)
            result.stages['llm_correction'] = _since(stage_started)

        # Convert to output format; rows with neither English nor German
        # text are left out.
        vocabulary = []
        for row in vocab_rows:
            if not (row.english or row.german):
                continue
            vocabulary.append({
                "english": row.english,
                "german": row.german,
                "example": row.example,
                "confidence": round(row.confidence, 1),
            })
        result.vocabulary = vocabulary

        result.duration_seconds = _since(pipeline_started)
        logger.info(f"CV Pipeline complete: {len(result.vocabulary)} entries in {result.duration_seconds}s")

    except Exception as e:
        logger.error(f"CV Pipeline error: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        result.error = str(e)
        result.duration_seconds = _since(pipeline_started)

    return result
|
||||
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
Multi-pass OCR, line matching, LLM/spell review, and pipeline orchestration.
|
||||
|
||||
Re-export facade -- all logic lives in the sub-modules:
|
||||
|
||||
cv_review_pipeline Stages 6-8: OCR, line alignment, orchestrator
|
||||
cv_review_spell Rule-based spell-checker OCR correction
|
||||
cv_review_llm LLM-based OCR correction, prompt building, streaming
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
# Re-export everything for backward compatibility
|
||||
from cv_review_pipeline import ( # noqa: F401
|
||||
ocr_region,
|
||||
run_multi_pass_ocr,
|
||||
match_lines_to_vocab,
|
||||
llm_post_correct,
|
||||
run_cv_pipeline,
|
||||
)
|
||||
|
||||
from cv_review_spell import ( # noqa: F401
|
||||
_SPELL_AVAILABLE,
|
||||
_spell_dict_knows,
|
||||
_spell_fix_field,
|
||||
_spell_fix_token,
|
||||
_try_split_merged_word,
|
||||
_normalize_page_ref,
|
||||
spell_review_entries_sync,
|
||||
spell_review_entries_streaming,
|
||||
)
|
||||
|
||||
from cv_review_llm import ( # noqa: F401
|
||||
OLLAMA_REVIEW_MODEL,
|
||||
REVIEW_ENGINE,
|
||||
_REVIEW_BATCH_SIZE,
|
||||
_build_llm_prompt,
|
||||
_diff_batch,
|
||||
_entry_needs_review,
|
||||
_is_spurious_change,
|
||||
_parse_llm_json_array,
|
||||
_sanitize_for_json,
|
||||
llm_review_entries,
|
||||
llm_review_entries_streaming,
|
||||
)
|
||||
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
CV Review Spell — Rule-based OCR spell correction (no LLM).
|
||||
|
||||
Provides dictionary-backed digit-to-letter substitution, umlaut correction,
|
||||
general spell correction, merged-word splitting, and page-ref normalization.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# pyspellchecker is an optional dependency. When it imports, one English and
# one German checker are created once at module load and _SPELL_AVAILABLE is
# set; otherwise both checkers are None and _SPELL_AVAILABLE is False, which
# the helpers below test before touching the checkers.
# NOTE(review): only ImportError is handled -- any other error raised while
# constructing the checkers propagates at import time.
try:
    from spellchecker import SpellChecker as _SpellChecker
    # distance=1 -- presumably limits candidates to a single edit; confirm
    # against the pyspellchecker documentation.
    _en_spell = _SpellChecker(language='en', distance=1)
    _de_spell = _SpellChecker(language='de', distance=1)
    _SPELL_AVAILABLE = True
    logger.info("pyspellchecker loaded (EN+DE)")
except ImportError:
    _SPELL_AVAILABLE = False
    _en_spell = None  # type: ignore[assignment]
    _de_spell = None  # type: ignore[assignment]
    logger.warning("pyspellchecker not installed")
|
||||
|
||||
|
||||
# ---- Page-Ref Normalization ----
|
||||
# Normalizes OCR variants like "p-60", "p 61", "p60" -> "p.60"
|
||||
_PAGE_REF_RE = re.compile(r'\bp[\s\-]?(\d+)', re.IGNORECASE)
|
||||
|
||||
|
||||
def _normalize_page_ref(text: str) -> str:
|
||||
"""Normalize page references: 'p-60' / 'p 61' / 'p60' -> 'p.60'."""
|
||||
if not text:
|
||||
return text
|
||||
return _PAGE_REF_RE.sub(lambda m: f"p.{m.group(1)}", text)
|
||||
|
||||
|
||||
# Characters OCR tends to emit in place of a letter, each mapped to the
# replacement candidates to try, in order of preference.
_SPELL_SUBS: Dict[str, List[str]] = {
    '0': ['O', 'o'],
    '1': ['l', 'I'],
    '5': ['S', 's'],
    '6': ['G', 'g'],
    '8': ['B', 'b'],
    '|': ['I', 'l', '1'],
}
# The keys of the table above as an immutable set, for membership tests.
_SPELL_SUSPICIOUS = frozenset(_SPELL_SUBS.keys())

# Tokenizer: group 1 is a run of ASCII letters, German umlauts/sharp s or
# pipes (one word token); group 2 is the run of all other characters that
# follows it (the separator, possibly empty).
# NOTE(review): digits are not part of group 1, so tokens produced by this
# pattern never contain the digit keys of _SPELL_SUBS -- confirm intent.
_SPELL_TOKEN_RE = re.compile(r'([A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]+)([^A-Za-z\u00c4\u00d6\u00dc\u00e4\u00f6\u00fc\u00df|]*)')
|
||||
|
||||
|
||||
def _spell_dict_knows(word: str) -> bool:
    """Report whether *word* (lower-cased) is in the EN or the DE dictionary.

    Always False when pyspellchecker is unavailable. The German dictionary is
    only consulted if the English one does not know the word.
    """
    if not _SPELL_AVAILABLE:
        return False
    lowered = word.lower()
    if _en_spell.known([lowered]):
        return True
    return bool(_de_spell.known([lowered]))
|
||||
|
||||
|
||||
def _try_split_merged_word(token: str) -> Optional[str]:
    """Split a run-together token such as 'atmyschool' into 'at my school'.

    Dynamic programming over prefixes of the lower-cased token: for every
    prefix the best known segmentation is kept, where "best" means fewest
    words and, on equal word count, the larger sum of squared word lengths.
    Single-letter words other than 'a' and 'i' are rejected and a word may be
    at most 20 characters long.

    Returns:
        The parts joined by spaces, cut from the original token so its
        capitalisation is kept; None if the spell checker is unavailable,
        the token is shorter than 4 characters, or no split into at least
        two dictionary words exists.
    """
    if len(token) < 4 or not _SPELL_AVAILABLE:
        return None

    lowered = token.lower()
    length = len(lowered)

    # best[k] = (word lengths, sum of squared lengths) for lowered[:k],
    # or None while that prefix has no valid segmentation.
    best: list = [None] * (length + 1)
    best[0] = ([], 0)

    for end in range(1, length + 1):
        for start in range(max(0, end - 20), end):
            prefix = best[start]
            if prefix is None:
                continue
            piece = lowered[start:end]
            size = end - start
            if size == 1 and piece not in ('a', 'i'):
                continue
            if not _spell_dict_knows(piece):
                continue
            lengths = prefix[0] + [size]
            squares = prefix[1] + size * size
            current = best[end]
            # ">=": on an exact tie the later candidate replaces the earlier.
            if current is None or (-len(lengths), squares) >= (-len(current[0]), current[1]):
                best[end] = (lengths, squares)

    final = best[length]
    if final is None or len(final[0]) < 2:
        return None

    pieces = []
    cursor = 0
    for size in final[0]:
        pieces.append(token[cursor:cursor + size])
        cursor += size

    joined = " ".join(pieces)
    logger.debug("Split merged word: %r -> %r", token, joined)
    return joined
|
||||
|
||||
|
||||
def _spell_fix_token(token: str, field: str = "") -> Optional[str]:
    """Return the corrected form of *token*, or None if nothing applies.

    Strategies are tried in this order, the first hit wins:
      1. token already known to a dictionary -> None (no fix needed)
      2. substitution of one suspicious character (digit / pipe)
      3. a single vowel -> umlaut replacement (only for field 'german')
      4. the spell checker's own correction for the field's language
      5. splitting a merged word into dictionary words

    *field* is 'english' or 'german' and selects the dictionary for step 4.
    """
    suspicious = any(ch in _SPELL_SUSPICIOUS for ch in token)

    # 1. Known word: leave it alone.
    if _spell_dict_knows(token):
        return None

    # 2. Digit / pipe substitution.
    if suspicious:
        if token == '|':
            return 'I'
        for idx, ch in enumerate(token):
            for replacement in _SPELL_SUBS.get(ch, ()):
                candidate = token[:idx] + replacement + token[idx + 1:]
                if _spell_dict_knows(candidate):
                    return candidate
        # No dictionary hit: a suspicious first character followed only by
        # lower-case letters is still replaced by its preferred substitute.
        head, tail = token[0], token[1:]
        if head in _SPELL_SUBS and len(token) >= 2 and tail.isalpha() and tail.islower():
            candidate = _SPELL_SUBS[head][0] + tail
            if not candidate[0].isdigit():
                return candidate

    # 3. OCR umlaut confusion (German only).
    if field == "german" and len(token) >= 3 and token.isalpha():
        umlaut_for = {'a': '\u00e4', 'o': '\u00f6', 'u': '\u00fc', 'i': '\u00fc',
                      'A': '\u00c4', 'O': '\u00d6', 'U': '\u00dc', 'I': '\u00dc'}
        for idx, ch in enumerate(token):
            umlaut = umlaut_for.get(ch)
            if umlaut is None:
                continue
            candidate = token[:idx] + umlaut + token[idx + 1:]
            if _spell_dict_knows(candidate):
                return candidate

    # 4. General spell correction for purely alphabetic unknown words.
    if len(token) >= 3 and token.isalpha() and not suspicious:
        if field == "english":
            checker = _en_spell
        elif field == "german":
            checker = _de_spell
        else:
            checker = None
        if checker is not None:
            lowered = token.lower()
            correction = checker.correction(lowered)
            if correction and correction != lowered:
                if token[0].isupper():
                    # Carry the original's leading capital over.
                    correction = correction[0].upper() + correction[1:]
                if _spell_dict_knows(correction):
                    return correction

    # 5. Merged-word split.
    if len(token) >= 4 and token.isalpha():
        return _try_split_merged_word(token) or None

    return None
|
||||
|
||||
|
||||
def _spell_fix_field(text: str, field: str = "") -> Tuple[str, bool]:
    """Apply OCR corrections to a text field.

    A pipe standing directly before '.' or ',' (and not preceded by a word
    character) is first turned into '1' (numbered-list prefix). The text is
    then tokenized and every word token is passed to ``_spell_fix_token``;
    separators and any characters before the first token are copied through
    unchanged.

    Args:
        text: Field content; falsy values are returned as is.
        field: 'english' or 'german', forwarded to ``_spell_fix_token``.

    Returns:
        ``(fixed_text, was_changed)``.
    """
    if not text:
        return text, False
    has_suspicious = any(ch in text for ch in _SPELL_SUSPICIOUS)
    # Nothing to do for text with neither letters nor suspicious characters.
    if not has_suspicious and not any(c.isalpha() for c in text):
        return text, False
    # Pattern: | immediately before . or , -> numbered list prefix
    fixed = re.sub(r'(?<!\w)\|(?=[.,])', '1', text) if has_suspicious else text
    changed = fixed != text
    # Tokenize and fix word by word
    parts: List[str] = []
    pos = 0
    for m in _SPELL_TOKEN_RE.finditer(fixed):
        if m.start() > pos:
            # Text the tokenizer skipped (e.g. a leading "(" or "1. ");
            # without this it would be dropped from the result.
            parts.append(fixed[pos:m.start()])
        token, sep = m.group(1), m.group(2)
        correction = _spell_fix_token(token, field=field)
        if correction:
            parts.append(correction)
            changed = True
        else:
            parts.append(token)
        parts.append(sep)
        pos = m.end()
    if pos < len(fixed):
        parts.append(fixed[pos:])
    return ''.join(parts), changed
|
||||
|
||||
|
||||
def spell_review_entries_sync(entries: List[Dict]) -> Dict:
    """Rule-based OCR correction of vocabulary entries (no LLM involved).

    For every entry the ``source_page`` reference is normalised first. If
    ``_entry_needs_review`` accepts the entry, its ``english``, ``german``
    and ``example`` fields are corrected -- by ``SmartSpellChecker`` when it
    can be imported and constructed, otherwise by the legacy
    ``_spell_fix_field`` (which treats ``example`` as German).

    Args:
        entries: Entry dicts; they are copied, never modified in place.

    Returns:
        Dict with ``entries_original``, ``entries_corrected``, ``changes``
        (one record per modified field: ``row_index``, ``field``, ``old``,
        ``new``), ``skipped_count`` (always 0), ``model_used`` and
        ``duration_ms``. Modified entries carry ``llm_corrected = True``.
    """
    from cv_review_llm import _entry_needs_review

    started = time.time()
    changes: List[Dict] = []
    corrected_entries: List[Dict] = []

    # Prefer SmartSpellChecker; any failure falls back to the legacy path.
    smart_checker = None
    try:
        from smart_spell import SmartSpellChecker
        smart_checker = SmartSpellChecker()
        logger.debug("spell_review: using SmartSpellChecker")
    except Exception:
        logger.debug("spell_review: SmartSpellChecker not available, using legacy")

    language_codes = {"english": "en", "german": "de", "example": "auto"}

    def _apply(entry: Dict, fallback_index: int, name: str, old: str, new: str) -> None:
        # Log the change and write the new value into the entry copy.
        changes.append({
            "row_index": entry.get("row_index", fallback_index),
            "field": name,
            "old": old,
            "new": new,
        })
        entry[name] = new
        entry["llm_corrected"] = True

    for position, source_entry in enumerate(entries):
        entry = dict(source_entry)

        # Page-ref normalization
        page_ref = (entry.get("source_page") or "").strip()
        if page_ref:
            normalized_ref = _normalize_page_ref(page_ref)
            if normalized_ref != page_ref:
                _apply(entry, position, "source_page", page_ref, normalized_ref)

        if _entry_needs_review(entry):
            for field_name in ("english", "german", "example"):
                current = (entry.get(field_name) or "").strip()
                if not current:
                    continue

                if smart_checker:
                    outcome = smart_checker.correct_text(
                        current, lang=language_codes.get(field_name, "en"))
                    proposed = outcome.corrected
                    was_changed = outcome.changed
                else:
                    legacy_lang = "german" if field_name in ("german", "example") else "english"
                    proposed, was_changed = _spell_fix_field(current, field=legacy_lang)

                if was_changed and proposed != current:
                    _apply(entry, position, field_name, current, proposed)

        corrected_entries.append(entry)

    elapsed_ms = int((time.time() - started) * 1000)
    return {
        "entries_original": entries,
        "entries_corrected": corrected_entries,
        "changes": changes,
        "skipped_count": 0,
        "model_used": "smart-spell-checker" if smart_checker else "spell-checker",
        "duration_ms": elapsed_ms,
    }
|
||||
|
||||
|
||||
async def spell_review_entries_streaming(entries: List[Dict], batch_size: int = 50):
    """Async generator emitting SSE-style events for a spell-checker review.

    Yields exactly three events: ``meta``, a single ``batch`` covering all
    entries, and ``complete``. The whole review runs in one synchronous call
    to ``spell_review_entries_sync`` after the ``meta`` event; ``batch_size``
    is only echoed back in ``meta``.

    NOTE(review): the events always report the model as "spell-checker",
    even when the sync review reports a different ``model_used``.
    """
    entry_count = len(entries)

    meta_event = {
        "type": "meta",
        "total_entries": entry_count,
        "to_review": entry_count,
        "skipped": 0,
        "model": "spell-checker",
        "batch_size": batch_size,
    }
    yield meta_event

    review = spell_review_entries_sync(entries)
    found_changes = review["changes"]
    elapsed_ms = review["duration_ms"]

    reviewed_rows = []
    for position, entry in enumerate(entries):
        reviewed_rows.append(entry.get("row_index", position))

    batch_event = {
        "type": "batch",
        "batch_index": 0,
        "entries_reviewed": reviewed_rows,
        "changes": found_changes,
        "duration_ms": elapsed_ms,
        "progress": {"current": entry_count, "total": entry_count},
    }
    yield batch_event

    complete_event = {
        "type": "complete",
        "changes": found_changes,
        "model_used": "spell-checker",
        "duration_ms": elapsed_ms,
        "total_entries": entry_count,
        "reviewed": entry_count,
        "skipped": 0,
        "corrections_found": len(found_changes),
        "entries_corrected": review["entries_corrected"],
    }
    yield complete_event
|
||||
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Shared types, constants, and availability guards for the CV vocabulary pipeline.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re # noqa: F401 — re-exported for downstream modules
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np # noqa: F401
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Availability Guards ---
|
||||
|
||||
try:
|
||||
import cv2 # noqa: F401
|
||||
CV2_AVAILABLE = True
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
CV2_AVAILABLE = False
|
||||
logger.warning("OpenCV not available — CV pipeline disabled")
|
||||
|
||||
try:
|
||||
import pytesseract # noqa: F401
|
||||
from PIL import Image # noqa: F401
|
||||
TESSERACT_AVAILABLE = True
|
||||
except ImportError:
|
||||
pytesseract = None # type: ignore[assignment]
|
||||
Image = None # type: ignore[assignment,misc]
|
||||
TESSERACT_AVAILABLE = False
|
||||
logger.warning("pytesseract/Pillow not available — CV pipeline disabled")
|
||||
|
||||
CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
||||
|
||||
# --- IPA Dictionary ---
# IPA_AVAILABLE becomes True as soon as either English source below is
# usable: the eng_to_ipa package (American) or the Britfone JSON file.

IPA_AVAILABLE = False
_ipa_convert_american = None  # set to eng_to_ipa.convert when importable
_britfone_dict: Dict[str, str] = {}

try:
    import eng_to_ipa as _eng_to_ipa
    _ipa_convert_american = _eng_to_ipa.convert
    IPA_AVAILABLE = True
    logger.info("eng_to_ipa available — American IPA lookup enabled")
except ImportError:
    logger.info("eng_to_ipa not installed — American IPA disabled")

# Load Britfone dictionary (MIT license, ~15k British English IPA entries)
# from data/britfone_ipa.json next to this module; a missing or unreadable
# file only disables British IPA.
_britfone_path = os.path.join(os.path.dirname(__file__), 'data', 'britfone_ipa.json')
if os.path.exists(_britfone_path):
    try:
        with open(_britfone_path, 'r', encoding='utf-8') as f:
            _britfone_dict = json.load(f)
        IPA_AVAILABLE = True
        logger.info(f"Britfone loaded — {len(_britfone_dict)} British IPA entries")
    except Exception as e:
        logger.warning(f"Failed to load Britfone: {e}")
else:
    logger.info("Britfone not found — British IPA disabled")

# --- German IPA Dictionary (CC-BY-SA, Wiktionary) ---
# data/de_ipa.tsv is read line by line; each line is split at the first tab
# into key and value, and lines without a tab are ignored.

DE_IPA_AVAILABLE = False
_de_ipa_dict: Dict[str, str] = {}

_de_ipa_path = os.path.join(os.path.dirname(__file__), 'data', 'de_ipa.tsv')
if os.path.exists(_de_ipa_path):
    try:
        with open(_de_ipa_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t', 1)
                if len(parts) == 2:
                    _de_ipa_dict[parts[0]] = parts[1]
        DE_IPA_AVAILABLE = True
        logger.info(f"German IPA loaded — {len(_de_ipa_dict)} entries (CC-BY-SA, Wiktionary)")
    except Exception as e:
        logger.warning(f"Failed to load German IPA: {e}")
else:
    logger.info("German IPA not found — German IPA disabled")

# --- epitran German fallback (MIT license) ---
# _epitran_de stays None unless epitran imports and the 'deu-Latn' model
# initialises; a failing initialisation is logged, not raised.

_epitran_de = None
try:
    import epitran as _epitran_module
    _epitran_de = _epitran_module.Epitran('deu-Latn')
    logger.info("epitran loaded — German rule-based IPA fallback enabled")
except ImportError:
    logger.info("epitran not installed — German IPA fallback disabled")
except Exception as e:
    logger.warning(f"Failed to init epitran: {e}")
|
||||
|
||||
# --- Language Detection Constants ---
|
||||
|
||||
# German function words used as language-detection evidence.
# Kept in alphabetical order so duplicates and omissions are easy to spot.
# Umlauts are stored in ASCII transliteration ('fuer', 'ueber').
GERMAN_FUNCTION_WORDS = {
    'aber', 'als', 'auch', 'auf', 'aus',
    'bei',
    'das', 'dem', 'den', 'der', 'die', 'diese', 'diesem', 'dieser',
    'ein', 'eine', 'er', 'es',
    'fuer',
    'haben', 'hat',
    'ich', 'ihr', 'ist',
    'kann',
    'mit', 'muss',
    'nach', 'nicht', 'noch', 'nur',
    'oder',
    'sein', 'sich', 'sie', 'sind', 'soll',
    'ueber', 'und',
    'von',
    'war', 'wenn', 'werden', 'wie', 'wir', 'wird',
    'zu',
}
|
||||
|
||||
# English function words used as language-detection evidence.
# Kept in alphabetical order so duplicates and omissions are easy to spot.
ENGLISH_FUNCTION_WORDS = {
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'but', 'by',
    'can', 'could',
    'did', 'do', 'does',
    'for', 'from',
    'had', 'has', 'have', 'he', 'her', 'his',
    'in', 'is', 'it', 'its',
    'may', 'might', 'my',
    'not',
    'of', 'on', 'or', 'our',
    'she', 'should',
    'that', 'the', 'their', 'they', 'this', 'to',
    'was', 'we', 'were', 'which', 'will', 'with', 'would',
    'you', 'your',
}
|
||||
|
||||
|
||||
# --- Data Classes ---
|
||||
|
||||
@dataclass
class PageRegion:
    """A detected region on the page.

    Attributes:
        type: Region label. Values listed by the original author:
            'column_en', 'column_de', 'column_example', 'page_ref',
            'column_marker', 'column_text', 'header', 'footer',
            'margin_top', 'margin_bottom', 'column_headword',
            'column_article', 'column_ipa'.
        x: Left edge of the region.
        y: Top edge of the region.
        width: Horizontal extent of the region.
        height: Vertical extent of the region.
            NOTE(review): units are not stated on this class; presumably
            pixels like the other geometry types in this file. Confirm.
        classification_confidence: Confidence of the type label, 0.0-1.0.
            Defaults to 1.0.
        classification_method: How the label was derived: 'content',
            'position_enhanced' or 'position_fallback'. Defaults to "".
    """

    type: str
    x: int
    y: int
    width: int
    height: int
    classification_confidence: float = 1.0
    classification_method: str = ""
|
||||
|
||||
|
||||
@dataclass
class ColumnGeometry:
    """Geometrically detected column, before type classification.

    Attributes:
        index: 0-based position, left to right.
        x: Left edge of the column.
        y: Top edge of the column.
        width: Column width.
        height: Column height.
        word_count: Number of words in the column.
        words: Word dicts from Tesseract (text, conf, left, top, ...).
        width_ratio: width / content_width (0.0-1.0).
        is_sub_column: True if created by a _detect_sub_columns() split.
            Defaults to False.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    width_ratio: float
    is_sub_column: bool = False
|
||||
|
||||
|
||||
@dataclass
class RowGeometry:
    """Geometrically detected row with header/footer classification.

    Attributes:
        index: 0-based position, top to bottom.
        x: Absolute left edge (= content left_x).
        y: Absolute y start.
        width: Content width.
        height: Row height in px.
        word_count: Number of words in the row.
        words: Word dicts belonging to the row.
        row_type: 'content' | 'header' | 'footer'. Defaults to 'content'.
        gap_before: Gap in px above this row. Defaults to 0.
    """

    index: int
    x: int
    y: int
    width: int
    height: int
    word_count: int
    words: List[Dict]
    row_type: str = 'content'
    gap_before: int = 0
|
||||
|
||||
|
||||
@dataclass
class VocabRow:
    """A single vocabulary entry assembled from multi-column OCR.

    Every field has a default, so ``VocabRow()`` is a valid empty entry.

    Attributes:
        english: English text of the entry.
        german: German text of the entry.
        example: Example text of the entry.
        source_page: Page reference, stored as a string.
        confidence: Confidence value for the entry (scale not stated here).
        y_position: Vertical position of the entry.
            NOTE(review): presumably pixels on the source page. Confirm.
    """

    english: str = ""
    german: str = ""
    example: str = ""
    source_page: str = ""
    confidence: float = 0.0
    y_position: int = 0
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Complete result of the CV pipeline.

    Every field has a default; the list and dict fields use
    ``default_factory`` so instances never share mutable state.

    Attributes:
        vocabulary: Extracted entries as plain dicts.
        word_count: Number of words.
        columns_detected: Number of detected columns.
        duration_seconds: Total run time in seconds.
        stages: Mapping of stage name to a float value.
            NOTE(review): presumably per-stage duration. Confirm.
        error: Error message, or None when no error was recorded.
        image_width: Width of the processed image.
        image_height: Height of the processed image.
    """

    vocabulary: List[Dict[str, Any]] = field(default_factory=list)
    word_count: int = 0
    columns_detected: int = 0
    duration_seconds: float = 0.0
    stages: Dict[str, float] = field(default_factory=dict)
    error: Optional[str] = None
    image_width: int = 0
    image_height: int = 0
|
||||
|
||||
|
||||
@dataclass
class DocumentTypeResult:
    """Result of automatic document type detection.

    Attributes:
        doc_type: 'vocab_table' | 'full_text' | 'generic_table'.
        confidence: Detection confidence, 0.0-1.0.
        pipeline: 'cell_first' | 'full_page'.
        skip_steps: Step names to skip, e.g. ['columns', 'rows'].
            Defaults to a fresh empty list.
        features: Debug info. Defaults to a fresh empty dict.
    """

    doc_type: str
    confidence: float
    pipeline: str
    skip_steps: List[str] = field(default_factory=list)
    features: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class DetectedBox:
    """An embedded box (e.g. grammar tip, exercise) detected on the page.

    Attributes:
        x: Absolute pixel position (left edge).
        y: Top edge of the box.
        width: Box width.
        height: Box height.
        confidence: Detection confidence, 0.0-1.0.
        border_thickness: Thickness of the box border. Defaults to 0.
    """

    x: int
    y: int
    width: int
    height: int
    confidence: float
    border_thickness: int = 0
|
||||
|
||||
|
||||
@dataclass
class PageZone:
    """A horizontal zone of the page — either normal content or a detected box.

    Attributes:
        index: 0-based position, top to bottom.
        zone_type: 'content' | 'box'.
        y: Absolute pixel y.
        height: Zone height.
        x: Left edge of the zone.
        width: Zone width.
        box: The DetectedBox attached to this zone, or None (default).
        columns: ColumnGeometry entries attached to this zone. Defaults to
            a fresh empty list.
        image_overlays: Overlay dicts attached to this zone. Defaults to a
            fresh empty list.
        layout_hint: 'left_of_vsplit' or 'right_of_vsplit'; None by default.
        vsplit_group: Group ID for side-by-side rendering; None by default.
    """

    index: int
    zone_type: str
    y: int
    height: int
    x: int
    width: int
    box: Optional[DetectedBox] = None
    columns: List[ColumnGeometry] = field(default_factory=list)
    image_overlays: List[Dict] = field(default_factory=list)
    layout_hint: Optional[str] = None
    vsplit_group: Optional[int] = None
|
||||
@@ -0,0 +1,404 @@
|
||||
"""
|
||||
Words-First Grid Builder (Bottom-Up).
|
||||
|
||||
Builds a cell grid from Tesseract word_boxes directly, without requiring
|
||||
pre-detected columns or rows. Algorithm:
|
||||
|
||||
1. Cluster words into columns by X-gap analysis
|
||||
2. Cluster words into rows by Y-proximity
|
||||
3. Build cells at (column, row) intersections
|
||||
|
||||
Returns the same (cells, columns_meta) format as build_cell_grid_v2().
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import statistics
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from cv_ocr_engines import (
|
||||
_group_words_into_lines,
|
||||
_words_to_reading_order_text,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Column clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_columns(
|
||||
words: List[Dict],
|
||||
img_w: int,
|
||||
min_gap_pct: float = 3.0,
|
||||
max_columns: Optional[int] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into columns by finding large horizontal gaps.
|
||||
|
||||
Args:
|
||||
max_columns: If set, limits the number of columns by merging
|
||||
the closest adjacent pairs until the count matches.
|
||||
Prevents phantom columns from degraded OCR.
|
||||
|
||||
Returns a list of column dicts:
|
||||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||
sorted left-to-right.
|
||||
"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Sort by X center
|
||||
sorted_w = sorted(words, key=lambda w: w['left'] + w['width'] / 2)
|
||||
|
||||
# Collect word heights to compute adaptive threshold
|
||||
heights = [w['height'] for w in sorted_w if w.get('height', 0) > 0]
|
||||
median_h = statistics.median(heights) if heights else 30
|
||||
|
||||
# Adaptive gap threshold: 3× median word height, but at least min_gap_pct of image width
|
||||
min_gap_px = max(median_h * 3, img_w * min_gap_pct / 100) if img_w > 0 else median_h * 3
|
||||
|
||||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||
# For each word, compute right edge; for next word, compute left edge
|
||||
# Collect gaps with their sizes for max_columns enforcement
|
||||
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
|
||||
for i in range(len(sorted_w) - 1):
|
||||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||
left_edge = sorted_w[i + 1]['left']
|
||||
gap = left_edge - right_edge
|
||||
if gap > min_gap_px:
|
||||
split_x = (right_edge + left_edge) / 2
|
||||
gaps.append((gap, split_x))
|
||||
|
||||
# If max_columns is set, keep only the (max_columns - 1) largest gaps
|
||||
if max_columns and len(gaps) >= max_columns:
|
||||
gaps.sort(key=lambda g: g[0], reverse=True)
|
||||
gaps = gaps[:max_columns - 1]
|
||||
logger.info(
|
||||
f"_cluster_columns: limited to {max_columns} columns "
|
||||
f"(removed {len(gaps) + max_columns - 1 - (max_columns - 1)} smallest gaps)"
|
||||
)
|
||||
|
||||
boundaries = sorted(g[1] for g in gaps)
|
||||
|
||||
# Build column ranges from boundaries
|
||||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||
columns = []
|
||||
for ci in range(len(col_edges) - 1):
|
||||
columns.append({
|
||||
'index': ci,
|
||||
'type': f'column_{ci + 1}' if len(col_edges) > 2 else 'column_text',
|
||||
'x_min': col_edges[ci],
|
||||
'x_max': col_edges[ci + 1],
|
||||
})
|
||||
|
||||
return columns
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Row clustering
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cluster_rows(
|
||||
words: List[Dict],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into visual rows by Y-proximity.
|
||||
|
||||
Uses half the median word height as Y-tolerance.
|
||||
|
||||
Returns a list of row dicts:
|
||||
[{'index': 0, 'y_min': ..., 'y_max': ..., 'y_center': ...}, ...]
|
||||
sorted top-to-bottom.
|
||||
"""
|
||||
if not words:
|
||||
return []
|
||||
|
||||
heights = [w['height'] for w in words if w.get('height', 0) > 0]
|
||||
median_h = statistics.median(heights) if heights else 20
|
||||
y_tol = max(median_h * 0.5, 5)
|
||||
|
||||
lines = _group_words_into_lines(words, y_tolerance_px=int(y_tol))
|
||||
|
||||
rows = []
|
||||
for ri, line_words in enumerate(lines):
|
||||
y_min = min(w['top'] for w in line_words)
|
||||
y_max = max(w['top'] + w['height'] for w in line_words)
|
||||
rows.append({
|
||||
'index': ri,
|
||||
'y_min': y_min,
|
||||
'y_max': y_max,
|
||||
'y_center': (y_min + y_max) / 2,
|
||||
})
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Build cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _assign_word_to_column(word: Dict, columns: List[Dict]) -> int:
|
||||
"""Return column index for a word based on overlap, then center, then nearest.
|
||||
|
||||
Three-pass strategy (consistent with _assign_row_words_to_columns):
|
||||
1. Overlap-based: assign to column with maximum horizontal overlap.
|
||||
2. Midpoint-range: if no overlap, use midpoints between adjacent columns.
|
||||
3. Nearest center: last resort fallback.
|
||||
"""
|
||||
w_left = word['left']
|
||||
w_right = w_left + word['width']
|
||||
w_center = w_left + word['width'] / 2
|
||||
|
||||
# Pass 1: overlap-based
|
||||
best_col = -1
|
||||
best_overlap = 0
|
||||
for col in columns:
|
||||
overlap = max(0, min(w_right, col['x_max']) - max(w_left, col['x_min']))
|
||||
if overlap > best_overlap:
|
||||
best_overlap = overlap
|
||||
best_col = col['index']
|
||||
if best_col >= 0 and best_overlap > 0:
|
||||
return best_col
|
||||
|
||||
# Pass 2: midpoint-range (non-overlapping assignment zones)
|
||||
for ci, col in enumerate(columns):
|
||||
if ci == 0:
|
||||
assign_left = 0
|
||||
else:
|
||||
assign_left = (columns[ci - 1]['x_max'] + col['x_min']) / 2
|
||||
if ci == len(columns) - 1:
|
||||
assign_right = float('inf')
|
||||
else:
|
||||
assign_right = (col['x_max'] + columns[ci + 1]['x_min']) / 2
|
||||
if assign_left <= w_center < assign_right:
|
||||
return col['index']
|
||||
|
||||
# Pass 3: nearest column center
|
||||
return min(columns, key=lambda c: abs((c['x_min'] + c['x_max']) / 2 - w_center))['index']
|
||||
|
||||
|
||||
def _assign_word_to_row(word: Dict, rows: List[Dict]) -> int:
|
||||
"""Return row index for a word based on its Y-center.
|
||||
|
||||
When rows overlap (e.g. due to tall border-ghost characters inflating
|
||||
a row's y_max), prefer the row whose y_center is closest.
|
||||
"""
|
||||
y_center = word['top'] + word['height'] / 2
|
||||
# Find all rows whose y_range contains this word's center
|
||||
matching = [r for r in rows if r['y_min'] <= y_center <= r['y_max']]
|
||||
if matching:
|
||||
return min(matching, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||
# Fallback: nearest row by Y-center
|
||||
return min(rows, key=lambda r: abs(r['y_center'] - y_center))['index']
|
||||
|
||||
|
||||
def _build_cells(
|
||||
words: List[Dict],
|
||||
columns: List[Dict],
|
||||
rows: List[Dict],
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Build cell dicts from word assignments to (column, row) pairs."""
|
||||
if not columns or not rows:
|
||||
return []
|
||||
|
||||
# Bucket words into (col_idx, row_idx)
|
||||
buckets: Dict[Tuple[int, int], List[Dict]] = {}
|
||||
for w in words:
|
||||
ci = _assign_word_to_column(w, columns)
|
||||
ri = _assign_word_to_row(w, rows)
|
||||
buckets.setdefault((ci, ri), []).append(w)
|
||||
|
||||
cells = []
|
||||
for (ci, ri), cell_words in sorted(buckets.items(), key=lambda kv: (kv[0][1], kv[0][0])):
|
||||
col = columns[ci]
|
||||
row = rows[ri]
|
||||
|
||||
# Compute tight bbox from actual word positions
|
||||
x_min = min(w['left'] for w in cell_words)
|
||||
y_min = min(w['top'] for w in cell_words)
|
||||
x_max = max(w['left'] + w['width'] for w in cell_words)
|
||||
y_max = max(w['top'] + w['height'] for w in cell_words)
|
||||
bw = x_max - x_min
|
||||
bh = y_max - y_min
|
||||
|
||||
# Text from words in reading order
|
||||
text = _words_to_reading_order_text(cell_words, y_tolerance_px=max(10, int(bh * 0.4)))
|
||||
|
||||
# Average confidence
|
||||
confs = [w.get('conf', 0) for w in cell_words if w.get('conf', 0) > 0]
|
||||
avg_conf = sum(confs) / len(confs) if confs else 0.0
|
||||
|
||||
# Word boxes with absolute pixel coordinates (consistent with cv_cell_grid.py).
|
||||
# PaddleOCR returns phrase-level boxes (e.g. "competition [kompa'tifn]"),
|
||||
# but the overlay slide mechanism expects one box per word. Split multi-word
|
||||
# boxes into individual word positions proportional to character length.
|
||||
# Also split at "[" boundaries (IPA patterns like "badge[bxd3]").
|
||||
#
|
||||
# Sort in reading order: group by Y (same visual line), then sort by X.
|
||||
# Simple (top, left) sort fails when words on the same line have slightly
|
||||
# different top values (1-6px), causing wrong word order.
|
||||
y_tol_wb = max(10, int(bh * 0.4))
|
||||
reading_lines = _group_words_into_lines(cell_words, y_tolerance_px=y_tol_wb)
|
||||
ordered_cell_words = [w for line in reading_lines for w in line]
|
||||
|
||||
word_boxes = []
|
||||
for w in ordered_cell_words:
|
||||
raw_text = w.get('text', '').strip()
|
||||
# Split by whitespace, at "[" boundaries (IPA), and after leading "!"
|
||||
# e.g. "badge[bxd3]" → ["badge", "[bxd3]"]
|
||||
# e.g. "profit['proft]" → ["profit", "['proft]"]
|
||||
# e.g. "!Betonung" → ["!", "Betonung"]
|
||||
tokens = re.split(r'\s+|(?=\[)|(?<=!)(?=[A-Za-z\u00c0-\u024f])', raw_text)
|
||||
tokens = [t for t in tokens if t] # remove empty strings
|
||||
if len(tokens) <= 1:
|
||||
# Single word — keep as-is
|
||||
word_boxes.append({
|
||||
'text': raw_text,
|
||||
'left': w['left'],
|
||||
'top': w['top'],
|
||||
'width': w['width'],
|
||||
'height': w['height'],
|
||||
'conf': w.get('conf', 0),
|
||||
})
|
||||
else:
|
||||
# Multi-word phrase — split proportionally by character count
|
||||
total_chars = sum(len(t) for t in tokens)
|
||||
if total_chars == 0:
|
||||
continue
|
||||
# Small gap between words (2% of box width per gap)
|
||||
n_gaps = len(tokens) - 1
|
||||
gap_px = w['width'] * 0.02
|
||||
usable_w = w['width'] - gap_px * n_gaps
|
||||
cursor = w['left']
|
||||
for t in tokens:
|
||||
token_w = max(1, usable_w * len(t) / total_chars)
|
||||
word_boxes.append({
|
||||
'text': t,
|
||||
'left': round(cursor),
|
||||
'top': w['top'],
|
||||
'width': round(token_w),
|
||||
'height': w['height'],
|
||||
'conf': w.get('conf', 0),
|
||||
})
|
||||
cursor += token_w + gap_px
|
||||
|
||||
cells.append({
|
||||
'cell_id': f"R{ri:02d}_C{ci}",
|
||||
'row_index': ri,
|
||||
'col_index': ci,
|
||||
'col_type': col['type'],
|
||||
'text': text,
|
||||
'confidence': round(avg_conf, 1),
|
||||
'bbox_px': {'x': x_min, 'y': y_min, 'w': bw, 'h': bh},
|
||||
'bbox_pct': {
|
||||
'x': round(x_min / img_w * 100, 2) if img_w else 0,
|
||||
'y': round(y_min / img_h * 100, 2) if img_h else 0,
|
||||
'w': round(bw / img_w * 100, 2) if img_w else 0,
|
||||
'h': round(bh / img_h * 100, 2) if img_h else 0,
|
||||
},
|
||||
'word_boxes': word_boxes,
|
||||
'ocr_engine': 'words_first',
|
||||
'is_bold': False,
|
||||
})
|
||||
|
||||
return cells
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_grid_from_words(
    word_dicts: List[Dict],
    img_w: int,
    img_h: int,
    min_confidence: int = 30,
    box_rects: Optional[List[Dict]] = None,
    max_columns: Optional[int] = None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Build a cell grid bottom-up from Tesseract word boxes.

    Pipeline: confidence/text filter -> optional removal of words whose
    center lies inside a detected box -> column clustering -> row
    clustering -> cell construction.

    Args:
        word_dicts: Flat list of word dicts with keys
            text, left, top, width, height, conf
            (absolute pixel coordinates).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        min_confidence: Minimum OCR confidence to keep a word.
        box_rects: Optional list of box dicts with keys x, y, width, height.
            Words whose center lies inside one of these boxes are dropped
            before clustering (box-internal columns are detected separately
            in sub-sessions).
        max_columns: Optional upper bound on the number of columns, passed
            through to ``_cluster_columns``.

    Returns:
        (cells, columns_meta) — same format as build_cell_grid_v2().
        cells: list of cell dicts with cell_id, bbox_px, bbox_pct, etc.
        columns_meta: list of {'index', 'type', 'x', 'width'} dicts.
        Both lists are empty when no usable word remains.
    """
    if not word_dicts:
        logger.info("build_grid_from_words: no words — returning empty grid")
        return [], []

    # Keep only words that are confident enough and carry visible text.
    words = [
        candidate for candidate in word_dicts
        if candidate.get('conf', 0) >= min_confidence and candidate.get('text', '').strip()
    ]
    if not words:
        logger.info("build_grid_from_words: all words filtered (conf < %d)", min_confidence)
        return [], []

    logger.info("build_grid_from_words: %d words (after confidence filter from %d)", len(words), len(word_dicts))

    # Exclude words inside detected boxes — box columns are detected separately
    if box_rects:
        def _center_in_any_box(item: Dict) -> bool:
            cx = item['left'] + item['width'] / 2
            cy = item['top'] + item['height'] / 2
            return any(
                b['x'] <= cx <= b['x'] + b['width']
                and b['y'] <= cy <= b['y'] + b['height']
                for b in box_rects
            )

        outside = [item for item in words if not _center_in_any_box(item)]
        excluded = len(words) - len(outside)
        if excluded:
            logger.info("build_grid_from_words: excluded %d words inside %d box(es)",
                        excluded, len(box_rects))
        words = outside
        if not words:
            logger.info("build_grid_from_words: all words inside boxes — returning empty grid")
            return [], []

    # Step 1: cluster columns
    columns = _cluster_columns(words, img_w, max_columns=max_columns)
    logger.info("build_grid_from_words: %d column(s) detected%s",
                len(columns), f" (max={max_columns})" if max_columns else "")

    # Step 2: cluster rows
    rows = _cluster_rows(words)
    logger.info("build_grid_from_words: %d row(s) detected", len(rows))

    # Step 3: build cells
    cells = _build_cells(words, columns, rows, img_w, img_h)
    logger.info("build_grid_from_words: %d cells built", len(cells))

    # columns_meta mirrors the build_cell_grid_v2 format: integer x / width.
    columns_meta = [
        {
            'index': column['index'],
            'type': column['type'],
            'x': int(column['x_min']),
            'width': int(column['x_max'] - column['x_min']),
        }
        for column in columns
    ]

    return cells, columns_meta
|
||||
Reference in New Issue
Block a user