feat(ocr-pipeline): add Step 5 word recognition (grid from columns × rows)
Backend: build_word_grid() intersects column regions with content rows, OCRs each cell with language-specific Tesseract, and returns vocabulary entries with percent-based bounding boxes. New endpoints: POST /words, GET /image/words-overlay, ground-truth save/retrieve for words. Frontend: StepWordRecognition with overview + step-through labeling modes, goToStep callback for row correction feedback loop. MkDocs: OCR Pipeline documentation added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2169,6 +2169,142 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
||||
return regions


# =============================================================================
# Pipeline Step 5: Word Grid from Columns × Rows
# =============================================================================
def build_word_grid(
    ocr_img: np.ndarray,
    column_regions: List[PageRegion],
    row_geometries: List[RowGeometry],
    img_w: int,
    img_h: int,
    lang: str = "eng+deu",
) -> List[Dict[str, Any]]:
    """Build a word grid by intersecting columns and rows, then OCR each cell.

    Each (content row × vocabulary column) intersection is OCRed with a
    column-appropriate Tesseract language, and the recognized text is slotted
    into the entry field matching the column's role (english/german/example).

    Args:
        ocr_img: Binarized full-page image.
        column_regions: Classified columns from Step 3 (PageRegion list).
        row_geometries: Rows from Step 4 (RowGeometry list).
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        lang: Fallback Tesseract language for unmapped column types.

    Returns:
        List of entry dicts with english/german/example text, a per-row
        average OCR confidence, and bounding boxes in percent of image size.
        Rows where every cell OCRed to empty text are dropped.
    """
    # Filter to content rows only (skip header/footer)
    content_rows = [r for r in row_geometries if r.row_type == 'content']
    if not content_rows:
        logger.warning("build_word_grid: no content rows found")
        return []

    # Only vocabulary columns contribute cells to the grid
    vocab_column_types = {'column_en', 'column_de', 'column_example'}
    relevant_cols = [c for c in column_regions if c.type in vocab_column_types]
    if not relevant_cols:
        logger.warning("build_word_grid: no relevant vocabulary columns found")
        return []

    # Sort columns left-to-right so cells are visited in reading order
    relevant_cols.sort(key=lambda c: c.x)

    # Choose OCR language per column type
    lang_map = {
        'column_en': 'eng',
        'column_de': 'deu',
        'column_example': 'eng+deu',
    }

    def _pct_bbox(x: int, y: int, w: int, h: int) -> Dict[str, float]:
        # Convert a pixel rectangle to percent-of-image coordinates.
        return {
            'x': round(x / img_w * 100, 2),
            'y': round(y / img_h * 100, 2),
            'w': round(w / img_w * 100, 2),
            'h': round(h / img_h * 100, 2),
        }

    entries: List[Dict[str, Any]] = []

    for row_idx, row in enumerate(content_rows):
        entry: Dict[str, Any] = {
            'row_index': row_idx,
            'english': '',
            'german': '',
            'example': '',
            'confidence': 0.0,
            'bbox': _pct_bbox(row.x, row.y, row.width, row.height),
            'bbox_en': None,
            'bbox_de': None,
            'bbox_ex': None,
        }

        confidences: List[float] = []

        for col in relevant_cols:
            # Cell = column's horizontal extent × row's vertical extent,
            # clamped to the image. Fix: the previous clamp moved a negative
            # origin to 0 without shrinking the extent, which stretched the
            # cell past its true right/bottom edge; clamp both edges instead.
            cell_x = max(0, col.x)
            cell_y = max(0, row.y)
            cell_w = min(col.x + col.width, img_w) - cell_x
            cell_h = min(row.y + row.height, img_h) - cell_y

            if cell_w <= 0 or cell_h <= 0:
                continue

            cell_region = PageRegion(
                type=col.type,
                x=cell_x, y=cell_y,
                width=cell_w, height=cell_h,
            )

            # psm=7: treat the cell as a single line of text
            cell_lang = lang_map.get(col.type, lang)
            words = ocr_region(ocr_img, cell_region, lang=cell_lang, psm=7)

            # Left-to-right reading order; skip blank tokens so the join
            # doesn't produce stray double spaces
            words.sort(key=lambda w: w['left'])
            text = ' '.join(w['text'] for w in words if w['text'].strip())
            if words:
                confidences.append(sum(w['conf'] for w in words) / len(words))

            cell_bbox = _pct_bbox(cell_x, cell_y, cell_w, cell_h)

            if col.type == 'column_en':
                entry['english'] = text
                entry['bbox_en'] = cell_bbox
            elif col.type == 'column_de':
                entry['german'] = text
                entry['bbox_de'] = cell_bbox
            elif col.type == 'column_example':
                entry['example'] = text
                entry['bbox_ex'] = cell_bbox

        entry['confidence'] = round(
            sum(confidences) / len(confidences), 1
        ) if confidences else 0.0

        # Only include if at least one field has text
        if entry['english'] or entry['german'] or entry['example']:
            entries.append(entry)

    # Lazy %-args: skip string building when INFO logging is disabled
    logger.info(
        "build_word_grid: %d entries from %d content rows × %d columns",
        len(entries), len(content_rows), len(relevant_cols),
    )

    return entries
|
||||
|
||||
|
||||
# =============================================================================
# Stage 6: Multi-Pass OCR
# =============================================================================
Reference in New Issue
Block a user