feat(ocr-pipeline): word-based 5-column detection for vocabulary pages
Replace projection-profile layout analysis with Tesseract word-bounding-box clustering to detect 5-column vocabulary layouts (page_ref, EN, DE, markers, examples). Falls back to projection profiles when fewer than 3 column clusters are found. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -64,7 +64,7 @@ export interface DewarpGroundTruth {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export interface PageRegion {
|
export interface PageRegion {
|
||||||
type: 'column_en' | 'column_de' | 'column_example' | 'header' | 'footer'
|
type: 'column_en' | 'column_de' | 'column_example' | 'page_ref' | 'column_marker' | 'header' | 'footer'
|
||||||
x: number
|
x: number
|
||||||
y: number
|
y: number
|
||||||
width: number
|
width: number
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ const TYPE_COLORS: Record<string, string> = {
|
|||||||
column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400',
|
column_en: 'bg-blue-100 text-blue-700 dark:bg-blue-900/30 dark:text-blue-400',
|
||||||
column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400',
|
column_de: 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-400',
|
||||||
column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400',
|
column_example: 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400',
|
||||||
|
page_ref: 'bg-purple-100 text-purple-700 dark:bg-purple-900/30 dark:text-purple-400',
|
||||||
|
column_marker: 'bg-red-100 text-red-700 dark:bg-red-900/30 dark:text-red-400',
|
||||||
header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
header: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
||||||
footer: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
footer: 'bg-gray-100 text-gray-600 dark:bg-gray-700/50 dark:text-gray-400',
|
||||||
}
|
}
|
||||||
@@ -23,6 +25,8 @@ const TYPE_LABELS: Record<string, string> = {
|
|||||||
column_en: 'EN',
|
column_en: 'EN',
|
||||||
column_de: 'DE',
|
column_de: 'DE',
|
||||||
column_example: 'Beispiel',
|
column_example: 'Beispiel',
|
||||||
|
page_ref: 'Seite',
|
||||||
|
column_marker: 'Marker',
|
||||||
header: 'Header',
|
header: 'Header',
|
||||||
footer: 'Footer',
|
footer: 'Footer',
|
||||||
}
|
}
|
||||||
@@ -32,8 +36,8 @@ export function ColumnControls({ columnResult, onRerun, onGroundTruth, onNext, i
|
|||||||
|
|
||||||
if (!columnResult) return null
|
if (!columnResult) return null
|
||||||
|
|
||||||
const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column'))
|
const columns = columnResult.columns.filter((c: PageRegion) => c.type.startsWith('column') || c.type === 'page_ref')
|
||||||
const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column'))
|
const headerFooter = columnResult.columns.filter((c: PageRegion) => !c.type.startsWith('column') && c.type !== 'page_ref')
|
||||||
|
|
||||||
const handleGt = (isCorrect: boolean) => {
|
const handleGt = (isCorrect: boolean) => {
|
||||||
onGroundTruth({ is_correct: isCorrect })
|
onGroundTruth({ is_correct: isCorrect })
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ CV_PIPELINE_AVAILABLE = CV2_AVAILABLE and TESSERACT_AVAILABLE
|
|||||||
@dataclass
|
@dataclass
|
||||||
class PageRegion:
|
class PageRegion:
|
||||||
"""A detected region on the page."""
|
"""A detected region on the page."""
|
||||||
type: str # 'column_en', 'column_de', 'column_example', 'header', 'footer'
|
type: str # 'column_en', 'column_de', 'column_example', 'page_ref', 'column_marker', 'header', 'footer'
|
||||||
x: int
|
x: int
|
||||||
y: int
|
y: int
|
||||||
width: int
|
width: int
|
||||||
@@ -839,6 +839,225 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
|
|||||||
return regions
|
return regions
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Stage 5b: Word-Based Layout Analysis (5-Column Detection)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> List[PageRegion]:
    """Detect page columns by clustering the left edges of Tesseract word boxes.

    Intended for vocabulary pages with up to 5 columns (page_ref, EN, DE,
    markers, examples): x positions where several word boxes start are treated
    as column starts, and each column extends to the start of the next one.

    Falls back to the projection-based analyze_layout() when Tesseract fails,
    when fewer than 5 confident words are found, or when fewer than 3 column
    clusters remain (before or after merging).

    Args:
        ocr_img: Binarized grayscale image; used to find the content bounds
            and passed on to analyze_layout() when falling back.
        dewarped_bgr: BGR image the words are detected on (cropped to the
            content bounds before it is handed to Tesseract).

    Returns:
        Column regions sorted by x, followed by optional 'header' and 'footer'
        regions; or whatever analyze_layout() returns on fallback.
    """
    h, w = ocr_img.shape[:2]

    def _fallback() -> List[PageRegion]:
        # Projection-profile analysis on a freshly derived layout image.
        layout_img = create_layout_image(dewarped_bgr)
        return analyze_layout(layout_img, ocr_img)

    # --- Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    # Implausibly small content area: analyze the whole image instead.
    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"LayoutByWords: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Get word bounding boxes from Tesseract ---
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"LayoutByWords: Tesseract image_to_data failed: {e}, falling back")
        return _fallback()

    # Collect left edges (relative to the content ROI) of recognized words
    # with confidence >= 30.
    left_edges = []
    for i in range(len(data['text'])):
        # Confidence may arrive as int, float or a numeric string such as
        # '96.123'; an isdigit() check would reject the float forms and drop
        # every word. Anything unparsable counts as "no confidence".
        try:
            conf = int(float(data['conf'][i]))
        except (TypeError, ValueError, OverflowError):
            conf = -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        left_edges.append(int(data['left'][i]))

    if len(left_edges) < 5:
        logger.warning(f"LayoutByWords: only {len(left_edges)} words detected, falling back")
        return _fallback()

    logger.info(f"LayoutByWords: {len(left_edges)} words detected in content area")

    # --- Cluster left edges ---
    tolerance = max(10, int(content_w * 0.01))  # ~1% of content width
    sorted_edges = sorted(left_edges)

    # Single pass over the sorted edges: an edge joins the current cluster
    # when it lies within `tolerance` of the previously added edge.
    clusters = []
    current_cluster = [sorted_edges[0]]
    for edge in sorted_edges[1:]:
        if edge - current_cluster[-1] <= tolerance:
            current_cluster.append(edge)
        else:
            clusters.append(current_cluster)
            current_cluster = [edge]
    clusters.append(current_cluster)

    # Filter: only clusters with >= 2 words.
    # Each entry is (mean_x, word_count, min_edge, max_edge).
    significant = [(int(np.mean(c)), len(c), min(c), max(c)) for c in clusters if len(c) >= 2]
    significant.sort(key=lambda s: s[0])

    logger.info(f"LayoutByWords: {len(significant)} significant clusters "
                f"(from {len(clusters)} total): "
                f"{[(s[0]+left_x, s[1]) for s in significant[:10]]}")

    if len(significant) < 3:
        logger.info("LayoutByWords: < 3 clusters, falling back to projection-based layout")
        return _fallback()

    # --- Merge clusters that are very close (within 2*tolerance) ---
    merged = [significant[0]]
    for s in significant[1:]:
        if s[0] - merged[-1][0] < 2 * tolerance:
            # Merge: weighted average position, sum counts, widen edge range.
            prev = merged[-1]
            total = prev[1] + s[1]
            avg_x = (prev[0] * prev[1] + s[0] * s[1]) // total
            merged[-1] = (avg_x, total, min(prev[2], s[2]), max(prev[3], s[3]))
        else:
            merged.append(s)

    logger.info(f"LayoutByWords: {len(merged)} clusters after merging: "
                f"{[(m[0]+left_x, m[1]) for m in merged]}")

    if len(merged) < 3:
        logger.info("LayoutByWords: < 3 merged clusters, falling back")
        return _fallback()

    # --- Derive column boundaries ---
    # Small margin before each cluster start (0.5% of content width, min 5px).
    margin_px = max(5, int(content_w * 0.005))

    # Column starts in absolute image coordinates: (abs_x, word_count).
    col_starts = [
        (max(0, left_x + min_edge - margin_px), count)
        for _, count, min_edge, _ in merged
    ]

    # Each column runs to the next column's start; the last one runs to the
    # right content bound. Entries are (abs_x, width, word_count).
    col_defs = []
    for i, (start_x, count) in enumerate(col_starts):
        if i + 1 < len(col_starts):
            col_width = col_starts[i + 1][0] - start_x
        else:
            col_width = right_x - start_x
        col_defs.append((start_x, col_width, count))

    logger.info(f"LayoutByWords: column definitions: "
                f"{[(d[0], d[1], d[2]) for d in col_defs]}")

    # --- Assign types based on rules ---
    regions = []
    total_content_w = content_w  # always equals right_x - left_x at this point
    untyped = list(range(len(col_defs)))  # indices not yet assigned

    # Rule 1: Leftmost narrow column (< 12% width) → page_ref
    if col_defs[0][1] < total_content_w * 0.12:
        regions.append(PageRegion(
            type='page_ref', x=col_defs[0][0], y=top_y,
            width=col_defs[0][1], height=content_h
        ))
        untyped.remove(0)
        logger.info(f"LayoutByWords: col 0 → page_ref (width={col_defs[0][1]}px, "
                    f"{col_defs[0][1]*100/total_content_w:.1f}%)")

    # Rule 2: Narrow column with few words (< 8% width, <= 8 words) → column_marker
    for i in list(untyped):  # iterate over a copy; `untyped` is mutated below
        col_x, col_w, col_count = col_defs[i]
        if col_w < total_content_w * 0.08 and col_count <= 8:
            regions.append(PageRegion(
                type='column_marker', x=col_x, y=top_y,
                width=col_w, height=content_h
            ))
            untyped.remove(i)
            logger.info(f"LayoutByWords: col {i} → column_marker (width={col_w}px, "
                        f"{col_w*100/total_content_w:.1f}%, words={col_count})")

    # Rule 3: With at least 3 columns still untyped, the rightmost → column_example
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        regions.append(PageRegion(
            type='column_example', x=col_defs[last_idx][0], y=top_y,
            width=col_defs[last_idx][1], height=content_h
        ))
        untyped.remove(last_idx)
        logger.info(f"LayoutByWords: col {last_idx} → column_example")

    # Rule 4: First remaining → column_en, second → column_de
    if len(untyped) >= 2:
        en_idx = untyped[0]
        de_idx = untyped[1]
        regions.append(PageRegion(
            type='column_en', x=col_defs[en_idx][0], y=top_y,
            width=col_defs[en_idx][1], height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=col_defs[de_idx][0], y=top_y,
            width=col_defs[de_idx][1], height=content_h
        ))
        untyped = untyped[2:]
        logger.info(f"LayoutByWords: col {en_idx} → column_en, col {de_idx} → column_de")
    elif len(untyped) == 1:
        # Only one left — call it column_en
        idx = untyped[0]
        regions.append(PageRegion(
            type='column_en', x=col_defs[idx][0], y=top_y,
            width=col_defs[idx][1], height=content_h
        ))
        untyped = []

    # Any remaining untyped columns get generic column_example type
    for idx in untyped:
        regions.append(PageRegion(
            type='column_example', x=col_defs[idx][0], y=top_y,
            width=col_defs[idx][1], height=content_h
        ))

    # Sort by x position for consistent output
    regions.sort(key=lambda r: r.x)

    # Add header/footer spanning the full image width when the content area
    # leaves more than 10px above / below.
    if top_y > 10:
        regions.append(PageRegion(type='header', x=0, y=0, width=w, height=top_y))
    if bottom_y < h - 10:
        regions.append(PageRegion(type='footer', x=0, y=bottom_y, width=w, height=h - bottom_y))

    n_columns = len([r for r in regions if r.type.startswith('column') or r.type == 'page_ref'])
    logger.info(f"LayoutByWords: {n_columns} columns detected: "
                f"{[(r.type, r.x, r.width) for r in regions if r.type not in ('header','footer')]}")

    return regions
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Stage 6: Multi-Pass OCR
|
# Stage 6: Multi-Pass OCR
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from pydantic import BaseModel
|
|||||||
|
|
||||||
from cv_vocab_pipeline import (
|
from cv_vocab_pipeline import (
|
||||||
analyze_layout,
|
analyze_layout,
|
||||||
|
analyze_layout_by_words,
|
||||||
create_ocr_image,
|
create_ocr_image,
|
||||||
deskew_image,
|
deskew_image,
|
||||||
deskew_image_by_word_alignment,
|
deskew_image_by_word_alignment,
|
||||||
@@ -639,15 +640,11 @@ async def detect_columns(session_id: str):
|
|||||||
|
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
# Prepare images for analyze_layout
|
# Binarized image for layout analysis
|
||||||
gray = cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2GRAY)
|
|
||||||
# CLAHE-enhanced for layout analysis
|
|
||||||
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
|
||||||
layout_img = clahe.apply(gray)
|
|
||||||
# Binarized for text density
|
|
||||||
ocr_img = create_ocr_image(dewarped_bgr)
|
ocr_img = create_ocr_image(dewarped_bgr)
|
||||||
|
|
||||||
regions = analyze_layout(layout_img, ocr_img)
|
# Word-based detection (with automatic fallback to projection profiles)
|
||||||
|
regions = analyze_layout_by_words(ocr_img, dewarped_bgr)
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
columns = [asdict(r) for r in regions]
|
columns = [asdict(r) for r in regions]
|
||||||
@@ -740,11 +737,13 @@ async def _get_columns_overlay(session_id: str) -> Response:
|
|||||||
if img is None:
|
if img is None:
|
||||||
raise HTTPException(status_code=500, detail="Failed to decode image")
|
raise HTTPException(status_code=500, detail="Failed to decode image")
|
||||||
|
|
||||||
# Color map for region types
|
# Color map for region types (BGR)
|
||||||
colors = {
|
colors = {
|
||||||
"column_en": (255, 180, 0), # Blue (BGR)
|
"column_en": (255, 180, 0), # Blue
|
||||||
"column_de": (0, 200, 0), # Green
|
"column_de": (0, 200, 0), # Green
|
||||||
"column_example": (0, 140, 255), # Orange
|
"column_example": (0, 140, 255), # Orange
|
||||||
|
"page_ref": (200, 0, 200), # Purple
|
||||||
|
"column_marker": (0, 0, 220), # Red
|
||||||
"header": (128, 128, 128), # Gray
|
"header": (128, 128, 128), # Gray
|
||||||
"footer": (128, 128, 128), # Gray
|
"footer": (128, 128, 128), # Gray
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user