Fix: Remove broken getKlausurApiUrl and clean up empty lines

The sed replacement left orphaned hostname references in the story page
and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin, 2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions


cv_layout_analyze.py

@@ -0,0 +1,257 @@
"""
Legacy layout analysis using projection profiles.
Extracted from cv_layout_columns.py — contains:
- analyze_layout() (projection-profile based column/header/footer detection)
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import List
import numpy as np
from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]


def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
"""Detect columns, header, and footer using projection profiles.
Uses content-bounds detection to exclude page margins before searching
for column separators within the actual text area.
Args:
layout_img: CLAHE-enhanced grayscale image.
ocr_img: Binarized image for text density analysis.
Returns:
List of PageRegion objects describing detected regions.
"""
h, w = ocr_img.shape[:2]
# Invert: black text on white → white text on black for projection
inv = cv2.bitwise_not(ocr_img)
# --- Find actual content bounds (exclude page margins) ---
left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
content_w = right_x - left_x
content_h = bottom_y - top_y
logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")
if content_w < w * 0.3 or content_h < h * 0.3:
# Fallback if detection seems wrong
left_x, right_x = 0, w
top_y, bottom_y = 0, h
content_w, content_h = w, h
# --- Vertical projection within content area to find column separators ---
content_strip = inv[top_y:bottom_y, left_x:right_x]
v_proj = np.sum(content_strip, axis=0).astype(float)
v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
# Smooth the projection profile
kernel_size = max(5, content_w // 50)
if kernel_size % 2 == 0:
kernel_size += 1
v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
# Debug: log projection profile statistics
p_mean = float(np.mean(v_proj_smooth))
p_median = float(np.median(v_proj_smooth))
p_min = float(np.min(v_proj_smooth))
p_max = float(np.max(v_proj_smooth))
logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
f"mean={p_mean:.4f}, median={p_median:.4f}")
# Find valleys using multiple threshold strategies
# Strategy 1: relative to median (catches clear separators)
# Strategy 2: local minima approach (catches subtle gaps)
threshold = max(p_median * 0.3, p_mean * 0.2)
logger.info(f"Layout: valley threshold={threshold:.4f}")
in_valley = v_proj_smooth < threshold
# Find contiguous valley regions
all_valleys = []
start = None
for x in range(len(v_proj_smooth)):
if in_valley[x] and start is None:
start = x
elif not in_valley[x] and start is not None:
valley_width = x - start
valley_depth = float(np.min(v_proj_smooth[start:x]))
# Valley must be at least 3px wide
if valley_width >= 3:
all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
start = None
logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)}"
f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")
# Filter: valleys must be inside the content area (not at edges)
inner_margin = int(content_w * 0.08)
valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]
# If no valleys found with strict threshold, try local minima approach
if len(valleys) < 2:
logger.info("Layout: trying local minima approach for column detection")
# Divide content into 20 segments, find the 2 lowest
seg_count = 20
seg_width = content_w // seg_count
seg_scores = []
for i in range(seg_count):
sx = i * seg_width
ex = min((i + 1) * seg_width, content_w)
seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
seg_scores.append((i, sx, ex, seg_mean))
seg_scores.sort(key=lambda s: s[3])
logger.info(f"Layout: segment scores (lowest 5): "
f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")
# Find two lowest non-adjacent segments that create reasonable columns
candidate_valleys = []
for seg_idx, sx, ex, seg_mean in seg_scores:
# Must not be at the edges
if seg_idx <= 1 or seg_idx >= seg_count - 2:
continue
# Must be significantly lower than overall mean
if seg_mean < p_mean * 0.6:
center = (sx + ex) // 2
candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))
if len(candidate_valleys) >= 2:
# Pick the best pair: non-adjacent, creating reasonable column widths
candidate_valleys.sort(key=lambda v: v[2])
best_pair = None
best_score = float('inf')
for i in range(len(candidate_valleys)):
for j in range(i + 1, len(candidate_valleys)):
c1 = candidate_valleys[i][2]
c2 = candidate_valleys[j][2]
# Must be at least 20% apart
if (c2 - c1) < content_w * 0.2:
continue
col1 = c1
col2 = c2 - c1
col3 = content_w - c2
                    # Each column must be at least 12% of content width
if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
continue
parts = sorted([col1, col2, col3])
score = parts[2] - parts[0]
if score < best_score:
best_score = score
best_pair = (candidate_valleys[i], candidate_valleys[j])
if best_pair:
valleys = list(best_pair)
logger.info(f"Layout: local minima found 2 valleys: "
f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
logger.info(f"Layout: final {len(valleys)} valleys: "
f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
regions = []
if len(valleys) >= 2:
# 3-column layout detected
valleys.sort(key=lambda v: v[2])
if len(valleys) == 2:
sep1_center = valleys[0][2]
sep2_center = valleys[1][2]
else:
# Pick the two valleys that best divide into 3 parts
# Prefer wider valleys (more likely true separators)
best_pair = None
best_score = float('inf')
for i in range(len(valleys)):
for j in range(i + 1, len(valleys)):
c1, c2 = valleys[i][2], valleys[j][2]
# Each column should be at least 15% of content width
col1 = c1
col2 = c2 - c1
col3 = content_w - c2
if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
continue
# Score: lower is better (more even distribution)
parts = sorted([col1, col2, col3])
score = parts[2] - parts[0]
# Bonus for wider valleys (subtract valley width)
score -= (valleys[i][3] + valleys[j][3]) * 0.5
if score < best_score:
best_score = score
best_pair = (c1, c2)
if best_pair:
sep1_center, sep2_center = best_pair
else:
sep1_center = valleys[0][2]
sep2_center = valleys[1][2]
# Convert from content-relative to absolute coordinates
abs_sep1 = sep1_center + left_x
abs_sep2 = sep2_center + left_x
logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=abs_sep1, height=content_h
))
regions.append(PageRegion(
type='column_de', x=abs_sep1, y=top_y,
width=abs_sep2 - abs_sep1, height=content_h
))
regions.append(PageRegion(
type='column_example', x=abs_sep2, y=top_y,
width=w - abs_sep2, height=content_h
))
elif len(valleys) == 1:
# 2-column layout
abs_sep = valleys[0][2] + left_x
logger.info(f"Layout: 2 columns at separator x={abs_sep}")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=abs_sep, height=content_h
))
regions.append(PageRegion(
type='column_de', x=abs_sep, y=top_y,
width=w - abs_sep, height=content_h
))
else:
# No columns detected — run full-page OCR as single column
logger.warning("Layout: no column separators found, using full page")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=w, height=content_h
))
# Add header/footer info (gap-based detection with fallback)
# Lazy import to avoid circular dependency with cv_layout.py
from cv_layout_detection import _add_header_footer
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
col_count = len([r for r in regions if r.type.startswith('column')])
logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
return regions
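
A minimal usage sketch for the module above (cv_layout_analyze.py, per the cross-reference in the core module below). The "page.png" path and the CLAHE/Otsu preprocessing are illustrative assumptions; in the real pipeline both input images come from earlier preprocessing stages.

import cv2

from cv_layout_analyze import analyze_layout

# Placeholder input; the pipeline normally supplies these images itself.
gray = cv2.imread("page.png", cv2.IMREAD_GRAYSCALE)
# CLAHE-enhanced image for layout, Otsu-binarized image for text density.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
_, binarized = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

regions = analyze_layout(clahe, binarized)
for r in regions:
    print(r.type, r.x, r.y, r.width, r.height)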

cv_layout_classify.py

@@ -0,0 +1,494 @@
"""
Column type classification for OCR layout analysis.
Entry point: classify_column_types() with 4-level fallback chain.
Also provides positional_column_regions() and _build_margin_regions().
Position-based classifiers (Level 2+3) in cv_layout_classify_position.py.
"""
import logging
from typing import Dict, List, Optional
import numpy as np
from cv_vocab_types import ColumnGeometry, PageRegion
from cv_layout_scoring import (
_score_language,
_score_role,
_score_dictionary_signals,
_classify_dictionary_columns,
)
from cv_layout_classify_position import (
_classify_by_position_enhanced,
_classify_by_position_fallback,
)
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Margin Region Building
# ---------------------------------------------------------------------------
def _build_margin_regions(
all_regions: List[PageRegion],
left_x: int,
right_x: int,
img_w: int,
top_y: int,
content_h: int,
) -> List[PageRegion]:
"""Create margin_left / margin_right PageRegions from content bounds.
Margins represent the space between the image edge and the first/last
content column. They are used downstream for faithful page
reconstruction but are skipped during OCR.
"""
margins: List[PageRegion] = []
# Minimum gap (px) to create a margin region
_min_gap = 5
if left_x > _min_gap:
margins.append(PageRegion(
type='margin_left', x=0, y=top_y,
width=left_x, height=content_h,
classification_confidence=1.0,
classification_method='content_bounds',
))
# Right margin: from end of last content column to image edge
non_margin = [r for r in all_regions
if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
'margin_top', 'margin_bottom')]
if non_margin:
last_col_end = max(r.x + r.width for r in non_margin)
else:
last_col_end = right_x
if img_w - last_col_end > _min_gap:
margins.append(PageRegion(
type='margin_right', x=last_col_end, y=top_y,
width=img_w - last_col_end, height=content_h,
classification_confidence=1.0,
classification_method='content_bounds',
))
if margins:
logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")
return margins


# ---------------------------------------------------------------------------
# Positional Column Regions
# ---------------------------------------------------------------------------
def positional_column_regions(
geometries: List[ColumnGeometry],
content_w: int,
content_h: int,
left_x: int,
) -> List[PageRegion]:
"""Classify columns by position only (no language scoring).
Structural columns (page_ref, column_marker) are identified by geometry.
Remaining content columns are labelled left->right as column_en, column_de,
column_example. The names are purely positional -- no language analysis.
"""
structural: List[PageRegion] = []
content_cols: List[ColumnGeometry] = []
for g in geometries:
rel_x = g.x - left_x
# page_ref: narrow column in the leftmost 20% region
if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
structural.append(PageRegion(
type='page_ref', x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.95,
classification_method='positional',
))
# column_marker: very narrow, few words
elif g.width_ratio < 0.06 and g.word_count <= 15:
structural.append(PageRegion(
type='column_marker', x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.95,
classification_method='positional',
))
# empty or near-empty narrow column -> treat as margin/structural
elif g.word_count <= 2 and g.width_ratio < 0.15:
structural.append(PageRegion(
type='column_marker', x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.85,
classification_method='positional',
))
else:
content_cols.append(g)
# Single content column -> plain text page
if len(content_cols) == 1:
g = content_cols[0]
return structural + [PageRegion(
type='column_text', x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.9,
classification_method='positional',
)]
# No content columns
if not content_cols:
return structural
# Sort content columns left->right and assign positional labels
content_cols.sort(key=lambda g: g.x)
# With exactly 2 content columns: if the left one is very wide (>35%),
# it likely contains EN+DE combined, so the right one is examples.
if (len(content_cols) == 2
and content_cols[0].width_ratio > 0.35
and content_cols[1].width_ratio > 0.20):
labels = ['column_en', 'column_example']
else:
labels = ['column_en', 'column_de', 'column_example']
regions = list(structural)
for i, g in enumerate(content_cols):
label = labels[i] if i < len(labels) else 'column_example'
regions.append(PageRegion(
type=label, x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.95,
classification_method='positional',
))
logger.info(f"PositionalColumns: {len(structural)} structural, "
f"{len(content_cols)} content -> "
f"{[r.type for r in regions]}")
return regions


# ---------------------------------------------------------------------------
# Main Classification Entry Point
# ---------------------------------------------------------------------------
def classify_column_types(geometries: List[ColumnGeometry],
content_w: int,
top_y: int,
img_w: int,
img_h: int,
bottom_y: int,
left_x: int = 0,
right_x: int = 0,
inv: Optional[np.ndarray] = None,
document_category: Optional[str] = None,
margin_strip_detected: bool = False) -> List[PageRegion]:
"""Classify column types using a 3-level fallback chain.
Level 0: Dictionary detection (if signals are strong enough)
Level 1: Content-based (language + role scoring)
Level 2: Position + language (old rules enhanced with language detection)
Level 3: Pure position (exact old code, no regression)
Args:
geometries: List of ColumnGeometry from Phase A.
content_w: Total content width.
top_y: Top Y of content area.
img_w: Full image width.
img_h: Full image height.
bottom_y: Bottom Y of content area.
left_x: Left content bound (from _find_content_bounds).
right_x: Right content bound (from _find_content_bounds).
document_category: User-selected category (e.g. 'woerterbuch').
margin_strip_detected: Whether a decorative A-Z margin strip was found.
Returns:
List of PageRegion with types, confidence, and method.
"""
    # _add_header_footer lives in cv_layout_detection. Lazy-import here to
    # avoid a circular import at module level and so this module can be
    # tested independently.
from cv_layout_detection import _add_header_footer # noqa: E402
content_h = bottom_y - top_y
def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
"""Append margin_left / margin_right regions to *result*."""
margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)
return result + margins
# Special case: single column -> plain text page
if len(geometries) == 1:
geom = geometries[0]
return _with_margins([PageRegion(
type='column_text', x=geom.x, y=geom.y,
width=geom.width, height=geom.height,
classification_confidence=0.9,
classification_method='content',
)])
# --- Pre-filter: first/last columns with very few words -> column_ignore ---
# Sub-columns from _detect_sub_columns() are exempt: they intentionally
# have few words (page refs, markers) and should not be discarded.
ignore_regions = []
active_geometries = []
for idx, g in enumerate(geometries):
if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
ignore_regions.append(PageRegion(
type='column_ignore', x=g.x, y=g.y,
width=g.width, height=content_h,
classification_confidence=0.95,
classification_method='content',
))
logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) -> column_ignore (edge, few words)")
else:
active_geometries.append(g)
# Re-index active geometries for classification
for new_idx, g in enumerate(active_geometries):
g.index = new_idx
geometries = active_geometries
# Handle edge case: all columns ignored or only 1 left
if len(geometries) == 0:
return _with_margins(ignore_regions)
if len(geometries) == 1:
geom = geometries[0]
ignore_regions.append(PageRegion(
type='column_text', x=geom.x, y=geom.y,
width=geom.width, height=geom.height,
classification_confidence=0.9,
classification_method='content',
))
return _with_margins(ignore_regions)
# --- Score all columns ---
lang_scores = [_score_language(g.words) for g in geometries]
role_scores = [_score_role(g) for g in geometries]
logger.info(f"ClassifyColumns: language scores: "
f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
logger.info(f"ClassifyColumns: role scores: "
f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")
# --- Level 0: Dictionary detection ---
dict_signals = _score_dictionary_signals(
geometries,
document_category=document_category,
margin_strip_detected=margin_strip_detected,
)
if dict_signals["is_dictionary"]:
regions = _classify_dictionary_columns(
geometries, dict_signals, lang_scores, content_h,
)
if regions is not None:
logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
dict_signals["confidence"])
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
return _with_margins(ignore_regions + regions)
# --- Level 1: Content-based classification ---
regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
if regions is not None:
logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
return _with_margins(ignore_regions + regions)
# --- Level 2: Position + language enhanced ---
regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
if regions is not None:
logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
return _with_margins(ignore_regions + regions)
# --- Level 3: Pure position fallback (old code, no regression) ---
logger.info("ClassifyColumns: Level 3 (position fallback)")
regions = _classify_by_position_fallback(geometries, content_w, content_h)
_add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
return _with_margins(ignore_regions + regions)


# ---------------------------------------------------------------------------
# Level 1: Content-Based Classification
# ---------------------------------------------------------------------------
def _classify_by_content(geometries: List[ColumnGeometry],
lang_scores: List[Dict[str, float]],
role_scores: List[Dict[str, float]],
content_w: int,
content_h: int) -> Optional[List[PageRegion]]:
"""Level 1: Classify columns purely by content analysis.
Requires clear language signals to distinguish EN/DE columns.
Returns None if language signals are too weak.
"""
regions = []
assigned = set()
# Step 1: Assign structural roles first (reference, marker)
# left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0
for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
is_left_side = geom.x < left_20_threshold
has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
regions.append(PageRegion(
type='page_ref', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=rs['reference'],
classification_method='content',
))
assigned.add(i)
elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=rs['marker'],
classification_method='content',
))
assigned.add(i)
elif geom.width_ratio < 0.05 and not is_left_side:
# Narrow column on the right side -> marker, not page_ref
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.8,
classification_method='content',
))
assigned.add(i)
# Step 2: Among remaining columns, find EN and DE by language scores
remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
for i in range(len(geometries)) if i not in assigned]
if len(remaining) < 2:
# Not enough columns for EN/DE pair
if len(remaining) == 1:
i, geom, ls, rs = remaining[0]
regions.append(PageRegion(
type='column_text', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.6,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
# Check if we have enough language signal
en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]
# Position tiebreaker: when language signals are weak, use left=EN, right=DE
if (not en_candidates or not de_candidates) and len(remaining) >= 2:
max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
if max_eng < 0.15 and max_deu < 0.15:
# Both signals weak -- fall back to positional: left=EN, right=DE
sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
en_conf = 0.4
de_conf = 0.4
regions.append(PageRegion(
type='column_en', x=best_en[1].x, y=best_en[1].y,
width=best_en[1].width, height=content_h,
classification_confidence=en_conf,
classification_method='content',
))
assigned.add(best_en[0])
regions.append(PageRegion(
type='column_de', x=best_de[1].x, y=best_de[1].y,
width=best_de[1].width, height=content_h,
classification_confidence=de_conf,
classification_method='content',
))
assigned.add(best_de[0])
# Assign remaining as example
for i, geom, ls, rs in remaining:
if i not in assigned:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.4,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
if not en_candidates or not de_candidates:
# Language signals too weak for content-based classification
logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
return None
# Pick the best EN and DE candidates
best_en = max(en_candidates, key=lambda x: x[2]['eng'])
best_de = max(de_candidates, key=lambda x: x[2]['deu'])
# Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
# Example sentences contain English function words ("the", "a", "is") which inflate
# the eng score of the Example column. When the best EN candidate sits to the RIGHT
# of the DE column and there is another EN candidate to the LEFT, prefer the left one
# -- it is almost certainly the real vocabulary column.
if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
if left_of_de:
alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
logger.info(
f"ClassifyColumns: Level 1 position fix -- best EN col {best_en[0]} "
f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
best_en = alt_en
if best_en[0] == best_de[0]:
# Same column scored highest for both -- ambiguous
logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
return None
en_conf = best_en[2]['eng']
de_conf = best_de[2]['deu']
regions.append(PageRegion(
type='column_en', x=best_en[1].x, y=best_en[1].y,
width=best_en[1].width, height=content_h,
classification_confidence=round(en_conf, 2),
classification_method='content',
))
assigned.add(best_en[0])
regions.append(PageRegion(
type='column_de', x=best_de[1].x, y=best_de[1].y,
width=best_de[1].width, height=content_h,
classification_confidence=round(de_conf, 2),
classification_method='content',
))
assigned.add(best_de[0])
    # Step 3: Remaining columns -> column_example; confidence comes from the
    # sentence role score when it is strong enough, else a default of 0.5
for i, geom, ls, rs in remaining:
if i in assigned:
continue
if rs['sentence'] > 0.4:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(rs['sentence'], 2),
classification_method='content',
))
else:
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='content',
))
regions.sort(key=lambda r: r.x)
return regions
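
A rough sketch of driving classify_column_types() above with synthetic geometries. All coordinates, word dicts, and vocabulary are invented; only the ColumnGeometry fields and the call signature follow the code above, and the scoring helpers in cv_layout_scoring must be importable for the sketch to run.

from cv_layout_classify import classify_column_types
from cv_vocab_types import ColumnGeometry

def fake_words(texts, rel_left):
    # Minimal Tesseract-style word dicts; 'left' is relative to the content ROI.
    return [{'text': t, 'conf': 90, 'left': rel_left, 'top': 40 * i,
             'width': 60, 'height': 20} for i, t in enumerate(texts)]

def make_col(index, x, width, words, content_w):
    return ColumnGeometry(index=index, x=x, y=100, width=width, height=1400,
                          word_count=len(words), words=words,
                          width_ratio=width / content_w)

content_w = 2000
en = fake_words(["the", "house", "to", "run", "blue", "green", "water", "book"], 10)
de = fake_words(["das", "Haus", "laufen", "blau", "und", "Wasser", "Buch", "Tisch"], 960)
cols = [make_col(0, 100, 900, en, content_w), make_col(1, 1050, 900, de, content_w)]
regions = classify_column_types(cols, content_w=content_w, top_y=100,
                                img_w=2200, img_h=1600, bottom_y=1500,
                                left_x=100, right_x=2100)
print([(r.type, r.classification_method) for r in regions])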

cv_layout_classify_position.py

@@ -0,0 +1,218 @@
"""
Position-based column type classification for OCR layout analysis.
Contains Level 2 and Level 3 classification functions:
Level 2 _classify_by_position_enhanced: Position + language confirmation
Level 3 _classify_by_position_fallback: Pure positional (no regression)
Extracted from cv_layout_classify.py during file-size split.
"""
import logging
from typing import Dict, List, Optional
from cv_vocab_types import ColumnGeometry, PageRegion
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Level 2: Position-Enhanced Classification
# ---------------------------------------------------------------------------
def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
lang_scores: List[Dict[str, float]],
content_w: int,
content_h: int) -> Optional[List[PageRegion]]:
"""Level 2: Position-based rules enhanced with language confirmation.
Uses the old positional heuristics but confirms EN/DE assignment
with language scores (swapping if needed).
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
g0 = geometries[0]
ls0 = lang_scores[0]
has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=0.8,
classification_method='position_enhanced',
))
untyped.remove(0)
# Rule 2: Narrow columns with few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.7,
classification_method='position_enhanced',
))
untyped.remove(last_idx)
# Rule 4: First two remaining -> EN/DE, but check language to possibly swap
if len(untyped) >= 2:
idx_a = untyped[0]
idx_b = untyped[1]
ls_a = lang_scores[idx_a]
ls_b = lang_scores[idx_b]
# Default: first=EN, second=DE (old behavior)
en_idx, de_idx = idx_a, idx_b
conf = 0.7
# Swap if language signals clearly indicate the opposite
if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
en_idx, de_idx = idx_b, idx_a
conf = 0.85
logger.info(f"ClassifyColumns: Level 2 swapped EN/DE based on language scores")
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=conf,
classification_method='position_enhanced',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
untyped = []
# Remaining -> example
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=0.5,
classification_method='position_enhanced',
))
regions.sort(key=lambda r: r.x)
return regions
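
# Worked illustration of the swap rule above (hypothetical scores): with two
# remaining columns where ls_a = {'eng': 0.05, 'deu': 0.42} and
# ls_b = {'eng': 0.38, 'deu': 0.04}, both swap conditions hold, so the second
# column becomes column_en and the first column_de, at confidence 0.85.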


# ---------------------------------------------------------------------------
# Level 3: Position Fallback Classification
# ---------------------------------------------------------------------------
def _classify_by_position_fallback(geometries: List[ColumnGeometry],
content_w: int,
content_h: int) -> List[PageRegion]:
"""Level 3: Pure position-based fallback (identical to old code).
Guarantees no regression from the previous behavior.
"""
regions = []
untyped = list(range(len(geometries)))
first_x = geometries[0].x if geometries else 0
left_20_threshold = first_x + content_w * 0.20
# Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
g0 = geometries[0]
if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
regions.append(PageRegion(
type='page_ref', x=g0.x, y=g0.y,
width=g0.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(0)
# Rule 2: Narrow + few words -> marker
for i in list(untyped):
geom = geometries[i]
if geom.width_ratio < 0.06 and geom.word_count <= 15:
regions.append(PageRegion(
type='column_marker', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(i)
# Rule 3: Rightmost remaining -> example (if 3+)
if len(untyped) >= 3:
last_idx = untyped[-1]
geom = geometries[last_idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped.remove(last_idx)
# Rule 4: First remaining -> EN, second -> DE
if len(untyped) >= 2:
en_idx = untyped[0]
de_idx = untyped[1]
regions.append(PageRegion(
type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
width=geometries[en_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.append(PageRegion(
type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
width=geometries[de_idx].width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = untyped[2:]
elif len(untyped) == 1:
idx = untyped[0]
geom = geometries[idx]
regions.append(PageRegion(
type='column_en', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
untyped = []
for idx in untyped:
geom = geometries[idx]
regions.append(PageRegion(
type='column_example', x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=1.0,
classification_method='position_fallback',
))
regions.sort(key=lambda r: r.x)
return regions
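
For reference, a tiny sketch of the Level 3 fallback on three evenly spaced columns. All values are invented; the words lists stay empty because Level 3 never inspects word content.

from cv_layout_classify_position import _classify_by_position_fallback
from cv_vocab_types import ColumnGeometry

cols = [ColumnGeometry(index=i, x=x, y=0, width=600, height=1400,
                       word_count=50, words=[], width_ratio=600 / 2000)
        for i, x in enumerate((0, 650, 1300))]
regions = _classify_by_position_fallback(cols, content_w=2000, content_h=1400)
print([r.type for r in regions])  # ['column_en', 'column_de', 'column_example']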

cv_layout_column_refine.py

@@ -0,0 +1,458 @@
"""
Post-processing refinements for column geometry.
Extracted from cv_layout_columns.py — contains:
- _detect_sub_columns() (sub-column detection via left-edge alignment)
- _split_broad_columns() (broad column splitting via word-coverage gaps)
- expand_narrow_columns() (narrow column expansion into whitespace)
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
import statistics
from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
logger = logging.getLogger(__name__)


def _detect_sub_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
top_y: int = 0,
header_y: Optional[int] = None,
footer_y: Optional[int] = None,
_edge_tolerance: int = 8,
_min_col_start_ratio: float = 0.10,
) -> List[ColumnGeometry]:
"""Split columns that contain internal sub-columns based on left-edge alignment.
For each column, clusters word left-edges into alignment bins (within
``_edge_tolerance`` px). The leftmost bin whose word count reaches
``_min_col_start_ratio`` of the column total is treated as the true column
start. Any words to the left of that bin form a sub-column, provided they
number >= 2 and < 35 % of total.
Word ``left`` values are relative to the content ROI (offset by *left_x*),
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
bridges the two coordinate systems.
If *header_y* / *footer_y* are provided (absolute y-coordinates), words
in header/footer regions are excluded from alignment clustering to avoid
polluting the bins with page numbers or chapter titles. Word ``top``
values are relative to *top_y*.
Returns a new list of ColumnGeometry — potentially longer than the input.
"""
if content_w <= 0:
return geometries
result: List[ColumnGeometry] = []
for geo in geometries:
# Only consider wide-enough columns with enough words
if geo.width_ratio < 0.15 or geo.word_count < 5:
result.append(geo)
continue
# Collect left-edges of confident words, excluding header/footer
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
min_top_rel = (header_y - top_y) if header_y is not None else None
max_top_rel = (footer_y - top_y) if footer_y is not None else None
confident = [w for w in geo.words
if w.get('conf', 0) >= 30
and (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(confident) < 3:
result.append(geo)
continue
# --- Cluster left-edges into alignment bins ---
sorted_edges = sorted(w['left'] for w in confident)
bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge)
cur = [sorted_edges[0]]
for i in range(1, len(sorted_edges)):
if sorted_edges[i] - cur[-1] <= _edge_tolerance:
cur.append(sorted_edges[i])
else:
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
cur = [sorted_edges[i]]
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
# --- Find the leftmost bin qualifying as a real column start ---
total = len(confident)
min_count = max(3, int(total * _min_col_start_ratio))
col_start_bin = None
for b in bins:
if b[1] >= min_count:
col_start_bin = b
break
if col_start_bin is None:
result.append(geo)
continue
# Words to the left of the column-start bin are sub-column candidates
split_threshold = col_start_bin[2] - _edge_tolerance
sub_words = [w for w in geo.words if w['left'] < split_threshold]
main_words = [w for w in geo.words if w['left'] >= split_threshold]
# Count only body words (excluding header/footer) for the threshold check
# so that header/footer words don't artificially trigger a split.
sub_body = [w for w in sub_words
if (min_top_rel is None or w['top'] >= min_top_rel)
and (max_top_rel is None or w['top'] <= max_top_rel)]
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
result.append(geo)
continue
# --- Guard against inline markers (bullet points, numbering) ---
# Bullet points like "1.", "2.", "•", "-" sit close to the main
# column text and are part of the cell, not a separate column.
# Only split if the horizontal gap between the rightmost sub-word
# and the main column start is large enough.
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
gap_to_main = col_start_bin[2] - max_sub_right # px gap
median_heights = [w.get('height', 20) for w in confident]
med_h = statistics.median(median_heights) if median_heights else 20
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
if gap_to_main < min_gap:
logger.debug(
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
"(likely inline markers, not a sub-column)",
geo.index, gap_to_main, min_gap)
result.append(geo)
continue
# --- Build two sub-column geometries ---
# Word 'left' values are relative to left_x; geo.x is absolute.
# Convert the split position from relative to absolute coordinates.
max_sub_left = max(w['left'] for w in sub_words)
split_rel = (max_sub_left + col_start_bin[2]) // 2
split_abs = split_rel + left_x
sub_x = geo.x
sub_width = split_abs - geo.x
main_x = split_abs
main_width = (geo.x + geo.width) - split_abs
if sub_width <= 0 or main_width <= 0:
result.append(geo)
continue
sub_geo = ColumnGeometry(
index=0,
x=sub_x,
y=geo.y,
width=sub_width,
height=geo.height,
word_count=len(sub_words),
words=sub_words,
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
is_sub_column=True,
)
main_geo = ColumnGeometry(
index=0,
x=main_x,
y=geo.y,
width=main_width,
height=geo.height,
word_count=len(main_words),
words=main_words,
width_ratio=main_width / content_w if content_w > 0 else 0.0,
is_sub_column=True,
)
result.append(sub_geo)
result.append(main_geo)
logger.info(
f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
f"(rel={split_rel}), sub={len(sub_words)} words, "
f"main={len(main_words)} words, "
f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
)
# Re-index by left-to-right order
result.sort(key=lambda g: g.x)
for i, g in enumerate(result):
g.index = i
return result
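
# Worked illustration of the alignment-bin clustering above (hypothetical
# numbers): sorted left edges [100, 104, 250, 252, 255, 257, 260] with
# _edge_tolerance=8 cluster into bins (102, n=2) and (254, n=5). With 7
# confident words, min_count = max(3, int(7 * 0.10)) = 3, so the n=5 bin is
# the column start; the two words near x=100 become sub-column candidates
# (2 words, 2/7 < 35% of the total), still subject to the inline-marker
# gap guard before an actual split.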


def _split_broad_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int = 0,
_broad_threshold: float = 0.35,
_min_gap_px: int = 15,
_min_words_per_split: int = 5,
) -> List[ColumnGeometry]:
"""Split overly broad columns that contain two language blocks (EN+DE).
Uses word-coverage gap analysis: builds a per-pixel coverage array from the
words inside each broad column, finds the largest horizontal gap, and splits
the column at that gap.
Args:
geometries: Column geometries from _detect_sub_columns.
content_w: Width of the content area in pixels.
left_x: Left edge of content ROI in absolute image coordinates.
_broad_threshold: Minimum width_ratio to consider a column "broad".
_min_gap_px: Minimum gap width (pixels) to trigger a split.
_min_words_per_split: Both halves must have at least this many words.
Returns:
Updated list of ColumnGeometry (possibly with more columns).
"""
result: List[ColumnGeometry] = []
logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
for geo in geometries:
if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
result.append(geo)
continue
# Build word-coverage array (per pixel within column)
col_left_rel = geo.x - left_x # column left in content-relative coords
coverage = np.zeros(geo.width, dtype=np.float32)
for wd in geo.words:
# wd['left'] is relative to left_x (content ROI)
wl = wd['left'] - col_left_rel
wr = wl + wd.get('width', 0)
wl = max(0, int(wl))
wr = min(geo.width, int(wr))
if wr > wl:
coverage[wl:wr] += 1.0
# Light smoothing (kernel=3px) to avoid noise
if len(coverage) > 3:
kernel = np.ones(3, dtype=np.float32) / 3.0
coverage = np.convolve(coverage, kernel, mode='same')
# Normalise to [0, 1]
cmax = coverage.max()
if cmax > 0:
coverage /= cmax
# Find INTERNAL gaps where coverage < 0.5
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
low_mask = coverage < 0.5
all_gaps = []
_gs = None
for px in range(len(low_mask)):
if low_mask[px]:
if _gs is None:
_gs = px
else:
if _gs is not None:
all_gaps.append((_gs, px, px - _gs))
_gs = None
if _gs is not None:
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
# Filter: only internal gaps (not touching column edges)
_edge_margin = 10 # pixels from edge to ignore
internal_gaps = [g for g in all_gaps
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
f"{[g for g in all_gaps if g[2] >= 5]}, "
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
f"best={best_gap}")
if best_gap is None or best_gap[2] < _min_gap_px:
result.append(geo)
continue
gap_center = (best_gap[0] + best_gap[1]) // 2
# Split words by midpoint relative to gap
left_words = []
right_words = []
for wd in geo.words:
wl = wd['left'] - col_left_rel
mid = wl + wd.get('width', 0) / 2.0
if mid < gap_center:
left_words.append(wd)
else:
right_words.append(wd)
if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
result.append(geo)
continue
# Build two new ColumnGeometry objects
split_x_abs = geo.x + gap_center
left_w = gap_center
right_w = geo.width - gap_center
left_geo = ColumnGeometry(
index=0,
x=geo.x,
y=geo.y,
width=left_w,
height=geo.height,
word_count=len(left_words),
words=left_words,
width_ratio=left_w / content_w if content_w else 0,
is_sub_column=True,
)
right_geo = ColumnGeometry(
index=0,
x=split_x_abs,
y=geo.y,
width=right_w,
height=geo.height,
word_count=len(right_words),
words=right_words,
width_ratio=right_w / content_w if content_w else 0,
is_sub_column=True,
)
logger.info(
f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
f"left={len(left_words)} words (w={left_w}), "
f"right={len(right_words)} words (w={right_w})"
)
result.append(left_geo)
result.append(right_geo)
# Re-index left-to-right
result.sort(key=lambda g: g.x)
for i, g in enumerate(result):
g.index = i
return result


def expand_narrow_columns(
geometries: List[ColumnGeometry],
content_w: int,
left_x: int,
word_dicts: List[Dict],
) -> List[ColumnGeometry]:
"""Expand narrow columns into adjacent whitespace gaps.
Narrow columns (marker, page_ref, < 10% content width) often lose
    content at image edges due to residual shear. This expands them toward
    the neighbouring column, but never past the nearest word in the
    neighbour (plus a small safety margin).
Must be called AFTER _detect_sub_columns() so that sub-column splits
(which create the narrowest columns) have already happened.
"""
_NARROW_THRESHOLD_PCT = 10.0
_MIN_WORD_MARGIN = 4
if len(geometries) < 2:
return geometries
logger.info("ExpandNarrowCols: input %d cols: %s",
len(geometries),
[(i, g.x, g.width, round(g.width / content_w * 100, 1))
for i, g in enumerate(geometries)])
for i, g in enumerate(geometries):
col_pct = g.width / content_w * 100 if content_w > 0 else 100
if col_pct >= _NARROW_THRESHOLD_PCT:
continue
expanded = False
orig_pct = col_pct
# --- try expanding to the LEFT ---
if i > 0:
left_nb = geometries[i - 1]
# Gap can be 0 if sub-column split created adjacent columns.
# In that case, look at where the neighbor's rightmost words
# actually are — there may be unused space we can claim.
nb_words_right = [wd['left'] + wd.get('width', 0)
for wd in left_nb.words]
if nb_words_right:
rightmost_word_abs = left_x + max(nb_words_right)
safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN
else:
# No words in neighbor → we can take up to neighbor's start
safe_left_abs = left_nb.x + _MIN_WORD_MARGIN
if safe_left_abs < g.x:
g.width += (g.x - safe_left_abs)
g.x = safe_left_abs
expanded = True
# --- try expanding to the RIGHT ---
if i + 1 < len(geometries):
right_nb = geometries[i + 1]
nb_words_left = [wd['left'] for wd in right_nb.words]
if nb_words_left:
leftmost_word_abs = left_x + min(nb_words_left)
safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN
else:
safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN
cur_right = g.x + g.width
if safe_right_abs > cur_right:
g.width = safe_right_abs - g.x
expanded = True
if expanded:
col_left_rel = g.x - left_x
col_right_rel = col_left_rel + g.width
g.words = [wd for wd in word_dicts
if col_left_rel <= wd['left'] < col_right_rel]
g.word_count = len(g.words)
g.width_ratio = g.width / content_w if content_w > 0 else 0.0
logger.info(
"ExpandNarrowCols: col %d (%.1f%%%.1f%%) x=%d w=%d words=%d",
i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)
# --- Shrink overlapping neighbors to match new boundaries ---
# Left neighbor: its right edge must not exceed our new left edge
if i > 0:
left_nb = geometries[i - 1]
nb_right = left_nb.x + left_nb.width
if nb_right > g.x:
left_nb.width = g.x - left_nb.x
if left_nb.width < 0:
left_nb.width = 0
left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0
# Re-assign words
nb_left_rel = left_nb.x - left_x
nb_right_rel = nb_left_rel + left_nb.width
left_nb.words = [wd for wd in word_dicts
if nb_left_rel <= wd['left'] < nb_right_rel]
left_nb.word_count = len(left_nb.words)
# Right neighbor: its left edge must not be before our new right edge
if i + 1 < len(geometries):
right_nb = geometries[i + 1]
my_right = g.x + g.width
if right_nb.x < my_right:
old_right_edge = right_nb.x + right_nb.width
right_nb.x = my_right
right_nb.width = old_right_edge - right_nb.x
if right_nb.width < 0:
right_nb.width = 0
right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0
# Re-assign words
nb_left_rel = right_nb.x - left_x
nb_right_rel = nb_left_rel + right_nb.width
right_nb.words = [wd for wd in word_dicts
if nb_left_rel <= wd['left'] < nb_right_rel]
right_nb.word_count = len(right_nb.words)
return geometries
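
A small sketch exercising _split_broad_columns() on one synthetic broad column whose words form two blocks separated by a wide whitespace gap. All coordinates are invented; the gap in word coverage between the blocks is what triggers the split.

from cv_layout_column_refine import _split_broad_columns
from cv_vocab_types import ColumnGeometry

def block(rel_left, n=10):
    # n overlapping words starting every 20px, each 80px wide.
    return [{'text': 'w', 'conf': 90, 'left': rel_left + 20 * i, 'top': 30 * i,
             'width': 80, 'height': 20} for i in range(n)]

words = block(0) + block(600)  # ~340px with no word coverage between the blocks
broad = ColumnGeometry(index=0, x=0, y=0, width=1000, height=1400,
                       word_count=len(words), words=words,
                       width_ratio=1000 / 2000)
cols = _split_broad_columns([broad], content_w=2000, left_x=0)
print([(g.x, g.width, g.word_count) for g in cols])  # two halves, 10 words each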


@@ -0,0 +1,589 @@
"""
Core column detection: gap-based geometry and clustering fallback.
Extracted from the original cv_layout_columns.py — contains:
- _detect_columns_by_clustering() (fallback clustering)
- _build_geometries_from_starts() (geometry construction)
- detect_column_geometry() (main column detection)
Post-processing (sub-columns, broad-column split, narrow expansion)
lives in cv_layout_column_refine.py.
Legacy projection-profile layout lives in cv_layout_analyze.py.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing happens locally.
"""
import logging
from typing import Dict, List, Optional, Tuple
import numpy as np
from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
try:
import pytesseract
from PIL import Image
except ImportError:
pytesseract = None # type: ignore[assignment]
Image = None # type: ignore[assignment,misc]


# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================
# --- Phase A: Geometry Detection ---
def _detect_columns_by_clustering(
word_dicts: List[Dict],
left_edges: List[int],
edge_word_indices: List[int],
content_w: int,
content_h: int,
left_x: int,
right_x: int,
top_y: int,
bottom_y: int,
inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
"""Fallback: detect columns by clustering left-aligned word positions.
Used when the primary gap-based algorithm finds fewer than 2 gaps.
"""
tolerance = max(10, int(content_w * 0.01))
sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
clusters = []
cluster_widxs = []
cur_edges = [sorted_pairs[0][0]]
cur_widxs = [sorted_pairs[0][1]]
for edge, widx in sorted_pairs[1:]:
if edge - cur_edges[-1] <= tolerance:
cur_edges.append(edge)
cur_widxs.append(widx)
else:
clusters.append(cur_edges)
cluster_widxs.append(cur_widxs)
cur_edges = [edge]
cur_widxs = [widx]
clusters.append(cur_edges)
cluster_widxs.append(cur_widxs)
MIN_Y_COVERAGE_PRIMARY = 0.30
MIN_Y_COVERAGE_SECONDARY = 0.15
MIN_WORDS_SECONDARY = 5
cluster_infos = []
for c_edges, c_widxs in zip(clusters, cluster_widxs):
if len(c_edges) < 2:
continue
y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
y_span = max(y_positions) - min(y_positions)
y_coverage = y_span / content_h if content_h > 0 else 0.0
cluster_infos.append({
'mean_x': int(np.mean(c_edges)),
'count': len(c_edges),
'min_edge': min(c_edges),
'max_edge': max(c_edges),
'y_min': min(y_positions),
'y_max': max(y_positions),
'y_coverage': y_coverage,
})
primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
primary_set = set(id(c) for c in primary)
secondary = [c for c in cluster_infos
if id(c) not in primary_set
and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
and c['count'] >= MIN_WORDS_SECONDARY]
significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
if len(significant) < 3:
logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
return None
merge_distance = max(30, int(content_w * 0.06))
merged = [significant[0].copy()]
for s in significant[1:]:
if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
prev = merged[-1]
total = prev['count'] + s['count']
avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
prev['mean_x'] = avg_x
prev['count'] = total
prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
else:
merged.append(s.copy())
if len(merged) < 3:
logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
return None
logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")
margin_px = max(6, int(content_w * 0.003))
return _build_geometries_from_starts(
[(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
)


def _build_geometries_from_starts(
col_starts: List[Tuple[int, int]],
word_dicts: List[Dict],
left_x: int,
right_x: int,
top_y: int,
bottom_y: int,
content_w: int,
content_h: int,
inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
"""Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
geometries = []
for i, (start_x, count) in enumerate(col_starts):
if i + 1 < len(col_starts):
col_width = col_starts[i + 1][0] - start_x
else:
col_width = right_x - start_x
col_left_rel = start_x - left_x
col_right_rel = col_left_rel + col_width
col_words = [w for w in word_dicts
if col_left_rel <= w['left'] < col_right_rel]
geometries.append(ColumnGeometry(
index=i,
x=start_x,
y=top_y,
width=col_width,
height=content_h,
word_count=len(col_words),
words=col_words,
width_ratio=col_width / content_w if content_w > 0 else 0.0,
))
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
"""Detect column geometry using whitespace-gap analysis with word validation.
Phase A of the two-phase column detection. Uses vertical projection
profiles to find whitespace gaps between columns, then validates that
no gap cuts through a word bounding box.
Falls back to clustering-based detection if fewer than 2 gaps are found.
Args:
ocr_img: Binarized grayscale image for layout analysis.
dewarped_bgr: Original BGR image (for Tesseract word detection).
Returns:
Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
or None if detection fails entirely.
"""
h, w = ocr_img.shape[:2]
# --- Step 1: Find content bounds ---
inv = cv2.bitwise_not(ocr_img)
left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
content_w = right_x - left_x
content_h = bottom_y - top_y
if content_w < w * 0.3 or content_h < h * 0.3:
left_x, right_x = 0, w
top_y, bottom_y = 0, h
content_w, content_h = w, h
logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
f"y=[{top_y}..{bottom_y}] ({content_h}px)")
# --- Step 2: Get word bounding boxes from Tesseract ---
# Crop from left_x to full image width (not right_x) so words at the right
# edge of the last column are included even if they extend past the detected
# content boundary (right_x).
content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
try:
data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
except Exception as e:
logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
return None
word_dicts = []
left_edges = []
edge_word_indices = []
n_words = len(data['text'])
for i in range(n_words):
conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
text = str(data['text'][i]).strip()
if conf < 30 or not text:
continue
lx = int(data['left'][i])
ty = int(data['top'][i])
bw = int(data['width'][i])
bh = int(data['height'][i])
left_edges.append(lx)
edge_word_indices.append(len(word_dicts))
word_dicts.append({
'text': text, 'conf': conf,
'left': lx, 'top': ty, 'width': bw, 'height': bh,
})
if len(left_edges) < 5:
logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
return None
logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
# --- Step 2b: Segment by sub-headers ---
# Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
# text bands that pollute the vertical projection. We detect large
# horizontal gaps (= whitespace rows separating sections) and use only
# the tallest content segment for the projection. This makes column
# detection immune to sub-headers, illustrations, and section dividers.
content_strip = inv[top_y:bottom_y, left_x:right_x]
h_proj_row = np.sum(content_strip, axis=1).astype(float)
h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row
# Find horizontal gaps (near-empty rows)
H_GAP_THRESH = 0.02 # rows with <2% ink density are "empty"
h_in_gap = h_proj_row_norm < H_GAP_THRESH
H_MIN_GAP = max(5, content_h // 200) # min gap height ~5-7px
h_gaps: List[Tuple[int, int]] = []
h_gap_start = None
for y_idx in range(len(h_in_gap)):
if h_in_gap[y_idx]:
if h_gap_start is None:
h_gap_start = y_idx
else:
if h_gap_start is not None:
if y_idx - h_gap_start >= H_MIN_GAP:
h_gaps.append((h_gap_start, y_idx))
h_gap_start = None
if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
h_gaps.append((h_gap_start, len(h_in_gap)))
# Identify "large" gaps (significantly bigger than median) that indicate
# section boundaries (sub-headers, chapter titles).
if len(h_gaps) >= 3:
gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
median_gap_h = gap_sizes[len(gap_sizes) // 2]
large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
else:
large_gaps = h_gaps
# Build content segments between large gaps and pick the tallest
seg_boundaries = [0]
for gs, ge in large_gaps:
seg_boundaries.append(gs)
seg_boundaries.append(ge)
seg_boundaries.append(content_h)
segments = []
for i in range(0, len(seg_boundaries) - 1, 2):
seg_top = seg_boundaries[i]
seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
seg_height = seg_bot - seg_top
if seg_height > 20: # ignore tiny fragments
segments.append((seg_top, seg_bot, seg_height))
if segments:
segments.sort(key=lambda s: s[2], reverse=True)
best_seg = segments[0]
proj_strip = content_strip[best_seg[0]:best_seg[1], :]
effective_h = best_seg[2]
if len(segments) > 1:
logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
else:
proj_strip = content_strip
effective_h = content_h
# --- Step 3: Vertical projection profile ---
v_proj = np.sum(proj_strip, axis=0).astype(float)
v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj
# Smooth the projection to avoid noise-induced micro-gaps
kernel_size = max(5, content_w // 80)
if kernel_size % 2 == 0:
kernel_size += 1 # keep odd for symmetry
v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
# --- Step 4: Find whitespace gaps ---
# Threshold: areas with very little ink density are gaps
median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
gap_threshold = max(median_density * 0.15, 0.005)
in_gap = v_smooth < gap_threshold
MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width
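# Example (hypothetical): content_w=2400 gives MIN_GAP_WIDTH=12px; a median
# smoothed density of 0.08 gives gap_threshold = max(0.012, 0.005) = 0.012.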
# Collect contiguous gap regions
raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI
gap_start = None
for x in range(len(in_gap)):
if in_gap[x]:
if gap_start is None:
gap_start = x
else:
if gap_start is not None:
gap_width = x - gap_start
if gap_width >= MIN_GAP_WIDTH:
raw_gaps.append((gap_start, x))
gap_start = None
# Handle gap at the right edge
if gap_start is not None:
gap_width = len(in_gap) - gap_start
if gap_width >= MIN_GAP_WIDTH:
raw_gaps.append((gap_start, len(in_gap)))
logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
f"min_width={MIN_GAP_WIDTH}px): "
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
# --- Step 5: Validate gaps against word bounding boxes ---
# When using a segment for projection, only validate against words
# inside that segment — words from sub-headers or other sections
# would incorrectly overlap with real column gaps.
if segments and len(segments) > 1:
seg_top_abs = best_seg[0] # relative to content strip
seg_bot_abs = best_seg[1]
segment_words = [wd for wd in word_dicts
if wd['top'] >= seg_top_abs
and wd['top'] + wd['height'] <= seg_bot_abs]
logger.info(f"ColumnGeometry: filtering words to segment: "
f"{len(segment_words)}/{len(word_dicts)} words")
else:
segment_words = word_dicts
validated_gaps = []
for gap_start_rel, gap_end_rel in raw_gaps:
# Check if any word overlaps with this gap region
overlapping = False
for wd in segment_words:
word_left = wd['left']
word_right = wd['left'] + wd['width']
if word_left < gap_end_rel and word_right > gap_start_rel:
overlapping = True
break
if not overlapping:
validated_gaps.append((gap_start_rel, gap_end_rel))
else:
# Try to shift the gap to avoid the overlapping word(s)
# Find the tightest word boundaries within the gap region
min_word_left = content_w
max_word_right = 0
for wd in segment_words:
word_left = wd['left']
word_right = wd['left'] + wd['width']
if word_left < gap_end_rel and word_right > gap_start_rel:
min_word_left = min(min_word_left, word_left)
max_word_right = max(max_word_right, word_right)
# Try gap before the overlapping words
if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
validated_gaps.append((gap_start_rel, min_word_left))
logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
# Try gap after the overlapping words
elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
validated_gaps.append((max_word_right, gap_end_rel))
logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
else:
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
f"discarded (word overlap, no room to shift)")
logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")
# --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
# When pixel-based projection fails (e.g. due to illustrations or colored
# bands), use word bounding boxes to find clear vertical gaps. This is
# immune to decorative graphics that Tesseract doesn't recognise as words.
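# Illustrative profile (hypothetical): if every line has words spanning
# x=[0..450] and x=[520..1000], word_coverage stays 0 over [450..520], and
# that 70px span qualifies as a gap whenever WC_MIN_GAP <= 70.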
if len(validated_gaps) < 2:
logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
word_coverage = np.zeros(content_w, dtype=np.int32)
for wd in segment_words:
wl = max(0, wd['left'])
wr = min(wd['left'] + wd['width'], content_w)
if wr > wl:
word_coverage[wl:wr] += 1
# Smooth slightly to bridge tiny 1-2px noise gaps between words
wc_kernel = max(3, content_w // 300)
if wc_kernel % 2 == 0:
wc_kernel += 1
wc_smooth = np.convolve(word_coverage.astype(float),
np.ones(wc_kernel) / wc_kernel, mode='same')
wc_in_gap = wc_smooth < 0.5 # effectively zero word coverage
WC_MIN_GAP = max(4, content_w // 300)
wc_gaps: List[Tuple[int, int]] = []
wc_gap_start = None
for x in range(len(wc_in_gap)):
if wc_in_gap[x]:
if wc_gap_start is None:
wc_gap_start = x
else:
if wc_gap_start is not None:
if x - wc_gap_start >= WC_MIN_GAP:
wc_gaps.append((wc_gap_start, x))
wc_gap_start = None
if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
wc_gaps.append((wc_gap_start, len(wc_in_gap)))
logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
f"(min_width={WC_MIN_GAP}px): "
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")
if len(wc_gaps) >= 2:
validated_gaps = wc_gaps
# --- Step 6: Fallback to clustering if too few gaps ---
if len(validated_gaps) < 2:
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
return _detect_columns_by_clustering(
word_dicts, left_edges, edge_word_indices,
content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
)
# --- Step 7: Derive column boundaries from gaps ---
# Sort gaps by position
validated_gaps.sort(key=lambda g: g[0])
# Identify margin gaps (first and last) vs interior gaps
# A margin gap touches the edge of the content area (within 2% tolerance)
edge_tolerance = max(10, int(content_w * 0.02))
is_left_margin = validated_gaps[0][0] <= edge_tolerance
is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance
# Interior gaps define column boundaries
# Column starts at the end of a gap, ends at the start of the next gap
col_starts = []
if is_left_margin:
# First column starts after the left margin gap
first_gap_end = validated_gaps[0][1]
interior_gaps = validated_gaps[1:]
else:
# No left margin gap — first column starts at content left edge
first_gap_end = 0
interior_gaps = validated_gaps[:]
if is_right_margin:
# Last gap is right margin — don't use it as column start
interior_gaps_for_boundaries = interior_gaps[:-1]
right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start
else:
interior_gaps_for_boundaries = interior_gaps
right_boundary = content_w
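# Worked example (hypothetical): validated_gaps [(0, 40), (480, 520),
# (960, 1000)] in a content_w=1000 strip: both margin gaps are detected
# (edge_tolerance=20), one interior gap remains, so first_gap_end=40,
# right_boundary=960, and col_starts becomes [left_x+40, left_x+520].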
# First column
col_starts.append(left_x + first_gap_end)
# Columns between interior gaps
for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
col_starts.append(left_x + gap_end_rel)
# Count words per column region (for logging)
col_start_counts = []
for i, start_x in enumerate(col_starts):
if i + 1 < len(col_starts):
next_start = col_starts[i + 1]
else:
# Rightmost column always extends to full image width (w).
# The page margin contains only white space — extending the OCR
# crop to the image edge is safe and prevents text near the right
# border from being cut off.
next_start = w
col_left_rel = start_x - left_x
col_right_rel = next_start - left_x
n_words_in_col = sum(1 for wd in word_dicts  # wd, not w: avoid shadowing image width
if col_left_rel <= wd['left'] < col_right_rel)
col_start_counts.append((start_x, n_words_in_col))
logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
f"{col_start_counts}")
# --- Step 8: Build ColumnGeometry objects ---
# Determine right edge for each column
all_boundaries = []
for i, start_x in enumerate(col_starts):
if i + 1 < len(col_starts):
end_x = col_starts[i + 1]
else:
# Rightmost column always extends to full image width (w).
end_x = w
all_boundaries.append((start_x, end_x))
geometries = []
for i, (start_x, end_x) in enumerate(all_boundaries):
col_width = end_x - start_x
col_left_rel = start_x - left_x
col_right_rel = col_left_rel + col_width
col_words = [wd for wd in word_dicts
if col_left_rel <= wd['left'] < col_right_rel]
geometries.append(ColumnGeometry(
index=i,
x=start_x,
y=top_y,
width=col_width,
height=content_h,
word_count=len(col_words),
words=col_words,
width_ratio=col_width / content_w if content_w > 0 else 0.0,
))
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
# --- Step 9: Filter phantom narrow columns ---
# Tiny spurious gaps (e.g. 11px + 35px adjacent) can create very narrow
# columns (< 3% of content width) with zero or no words. These are not
# real columns — remove them and close the gap between neighbors.
min_real_col_w = max(20, int(content_w * 0.03))
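# Example (hypothetical): content_w=1500 gives min_real_col_w=45px; a 30px
# column holding no words is dropped, and the loop below widens its left
# neighbour to close the resulting gap.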
filtered_geoms = [g for g in geometries
if not (g.word_count < 3 and g.width < min_real_col_w)]
if len(filtered_geoms) < len(geometries):
n_removed = len(geometries) - len(filtered_geoms)
logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
f"(width < {min_real_col_w}px and words < 3)")
# Extend each remaining column to close gaps with its right neighbor
for i, g in enumerate(filtered_geoms):
if i + 1 < len(filtered_geoms):
g.width = filtered_geoms[i + 1].x - g.x
else:
g.width = w - g.x
g.index = i
col_left_rel = g.x - left_x
col_right_rel = col_left_rel + g.width
g.words = [wd for wd in word_dicts
if col_left_rel <= wd['left'] < col_right_rel]
g.word_count = len(g.words)
geometries = filtered_geoms
logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)


@@ -0,0 +1,479 @@
"""
Document type detection, image preparation, content bounds, and header/footer detection.
Extracted from cv_layout.py — these are the "input-side" helpers that run before
column/row geometry analysis.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import List, Optional, Tuple
import numpy as np
from cv_vocab_types import (
DocumentTypeResult,
PageRegion,
)
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# =============================================================================
# Document Type Detection
# =============================================================================
def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
"""Detect whether the page is a vocab table, generic table, or full text.
Uses projection profiles and text density analysis — no OCR required.
Runs in < 2 seconds.
Args:
ocr_img: Binarized grayscale image (for projection profiles).
img_bgr: BGR color image.
Returns:
DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
"""
if ocr_img is None or ocr_img.size == 0:
return DocumentTypeResult(
doc_type='full_text', confidence=0.5, pipeline='full_page',
skip_steps=['columns', 'rows'],
features={'error': 'empty image'},
)
h, w = ocr_img.shape[:2]
# --- 1. Vertical projection profile → detect column gaps ---
# Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
# Invert: dark pixels on white background → high values = text.
vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)
# Smooth the profile to avoid noise spikes
kernel_size = max(3, w // 100)
if kernel_size % 2 == 0:
kernel_size += 1
vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')
# Find significant vertical gaps (columns of near-zero text density)
# A gap must be at least 1% of image width and have < 5% of max density
max_density = max(vert_smooth.max(), 1)
gap_threshold = max_density * 0.05
min_gap_width = max(5, w // 100)
in_gap = False
gap_count = 0
gap_start = 0
vert_gaps = []
for x in range(w):
if vert_smooth[x] < gap_threshold:
if not in_gap:
in_gap = True
gap_start = x
else:
if in_gap:
gap_width = x - gap_start
if gap_width >= min_gap_width:
gap_count += 1
vert_gaps.append((gap_start, x, gap_width))
in_gap = False
# Filter out margin gaps (within 10% of image edges)
margin_threshold = w * 0.10
internal_gaps = [g for g in vert_gaps if g[0] > margin_threshold and g[1] < w - margin_threshold]
internal_gap_count = len(internal_gaps)
# --- 2. Horizontal projection profile → detect row gaps ---
horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
h_kernel = max(3, h // 200)
if h_kernel % 2 == 0:
h_kernel += 1
horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')
h_max = max(horiz_smooth.max(), 1)
h_gap_threshold = h_max * 0.05
min_row_gap = max(3, h // 200)
row_gap_count = 0
in_gap = False
for y in range(h):
if horiz_smooth[y] < h_gap_threshold:
if not in_gap:
in_gap = True
gap_start = y
else:
if in_gap:
if y - gap_start >= min_row_gap:
row_gap_count += 1
in_gap = False
# --- 3. Text density distribution (4×4 grid) ---
grid_rows, grid_cols = 4, 4
cell_h, cell_w = h // grid_rows, w // grid_cols
densities = []
for gr in range(grid_rows):
for gc in range(grid_cols):
cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
gc * cell_w:(gc + 1) * cell_w]
if cell.size > 0:
d = float(np.count_nonzero(cell < 128)) / cell.size
densities.append(d)
density_std = float(np.std(densities)) if densities else 0
density_mean = float(np.mean(densities)) if densities else 0
features = {
'vertical_gaps': gap_count,
'internal_vertical_gaps': internal_gap_count,
'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
'row_gaps': row_gap_count,
'density_mean': round(density_mean, 4),
'density_std': round(density_std, 4),
'image_size': (w, h),
}
# --- 4. Decision tree ---
# Use internal_gap_count (excludes margin gaps) for column detection.
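# Worked example (hypothetical counts): 3 internal vertical gaps and 20 row
# gaps classify as vocab_table with confidence
# min(0.95, 0.7 + 3*0.05 + 20*0.005) = 0.95.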
if internal_gap_count >= 2 and row_gap_count >= 5:
# Multiple internal vertical gaps + many row gaps → table
confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
return DocumentTypeResult(
doc_type='vocab_table',
confidence=round(confidence, 2),
pipeline='cell_first',
skip_steps=[],
features=features,
)
elif internal_gap_count >= 1 and row_gap_count >= 3:
# Some internal structure, likely a table
confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
return DocumentTypeResult(
doc_type='generic_table',
confidence=round(confidence, 2),
pipeline='cell_first',
skip_steps=[],
features=features,
)
elif internal_gap_count == 0:
# No internal column gaps → full text (regardless of density)
confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
return DocumentTypeResult(
doc_type='full_text',
confidence=round(confidence, 2),
pipeline='full_page',
skip_steps=['columns', 'rows'],
features=features,
)
else:
# Ambiguous — default to vocab_table (most common use case)
return DocumentTypeResult(
doc_type='vocab_table',
confidence=0.5,
pipeline='cell_first',
skip_steps=[],
features=features,
)
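# Minimal usage sketch (file path hypothetical; requires cv2):
#   img = cv2.imread("scan.png")
#   result = detect_document_type(create_ocr_image(img), img)
#   if result.pipeline == 'cell_first':
#       ...  # run column/row geometry analysis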
# =============================================================================
# Image Creation (Dual Image Preparation)
# =============================================================================
def create_ocr_image(img: np.ndarray) -> np.ndarray:
"""Create a binarized image optimized for Tesseract OCR.
Steps: Grayscale → Background normalization → Adaptive threshold → Denoise.
Args:
img: BGR image.
Returns:
Binary image (black text on white background, lightly denoised).
"""
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Background normalization: divide by blurred version
bg = cv2.GaussianBlur(gray, (51, 51), 0)
normalized = cv2.divide(gray, bg, scale=255)
# Adaptive binarization
binary = cv2.adaptiveThreshold(
normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY, 31, 10
)
# Light denoise
denoised = cv2.medianBlur(binary, 3)
return denoised
def create_layout_image(img: np.ndarray) -> np.ndarray:
"""Create a CLAHE-enhanced grayscale image for layout analysis.
Args:
img: BGR image.
Returns:
Enhanced grayscale image.
"""
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
return enhanced
# =============================================================================
# Content Bounds Detection
# =============================================================================
def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
"""Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
out = mask.copy()
n = len(out)
i = 0
while i < n:
if out[i]:
start = i
while i < n and out[i]:
i += 1
if (i - start) < min_width:
out[start:i] = False
else:
i += 1
return out
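# Illustrative behaviour (hypothetical mask): [T, T, F, T] with min_width=2
# becomes [T, T, F, F]; the isolated run of length 1 is discarded.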
def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
"""Find the bounding box of actual text content (excluding page margins).
Scan artefacts (thin black lines at page edges) are filtered out by
discarding contiguous projection runs narrower than 1 % of the image
dimension (min 5 px).
Returns:
Tuple of (left_x, right_x, top_y, bottom_y).
"""
h, w = inv.shape[:2]
threshold = 0.005
# --- Horizontal projection for top/bottom ---
h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
h_mask = h_proj > threshold
min_h_run = max(5, h // 100)
h_mask = _filter_narrow_runs(h_mask, min_h_run)
top_y = 0
for y in range(h):
if h_mask[y]:
top_y = max(0, y - 5)
break
bottom_y = h
for y in range(h - 1, 0, -1):
if h_mask[y]:
bottom_y = min(h, y + 5)
break
# --- Vertical projection for left/right margins ---
v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
v_mask = v_proj_norm > threshold
min_v_run = max(5, w // 100)
v_mask = _filter_narrow_runs(v_mask, min_v_run)
left_x = 0
for x in range(w):
if v_mask[x]:
left_x = max(0, x - 2)
break
right_x = w
for x in range(w - 1, 0, -1):
if v_mask[x]:
right_x = min(w, x + 2)
break
return left_x, right_x, top_y, bottom_y
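# Illustrative result (hypothetical): a 2480x3500 scan with ~100px blank
# margins returns roughly (98, 2382, 95, 3405); the small offsets come from
# the 2px/5px padding applied above.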
# =============================================================================
# Header / Footer Detection
# =============================================================================
def _detect_header_footer_gaps(
inv: np.ndarray,
img_w: int,
img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
"""Detect header/footer boundaries via horizontal projection gap analysis.
Scans the full-page inverted image for large horizontal gaps in the top/bottom
20% that separate header/footer content from the main body.
Returns:
(header_y, footer_y) — absolute y-coordinates.
header_y = bottom edge of header region (None if no header detected).
footer_y = top edge of footer region (None if no footer detected).
"""
HEADER_FOOTER_ZONE = 0.20
GAP_MULTIPLIER = 2.0
# Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
actual_h = min(inv.shape[0], img_h)
roi = inv[:actual_h, :]
h_proj = np.sum(roi, axis=1).astype(float)
proj_w = roi.shape[1]
h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj
# Step 2: Smoothing
kernel_size = max(3, actual_h // 200)
if kernel_size % 2 == 0:
kernel_size += 1
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
# Step 3: Gap threshold
positive = h_smooth[h_smooth > 0]
median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
gap_threshold = max(median_density * 0.15, 0.003)
in_gap = h_smooth < gap_threshold
MIN_GAP_HEIGHT = max(3, actual_h // 500)
# Step 4: Collect contiguous gaps
raw_gaps: List[Tuple[int, int]] = []
gap_start: Optional[int] = None
for y in range(len(in_gap)):
if in_gap[y]:
if gap_start is None:
gap_start = y
else:
if gap_start is not None:
gap_height = y - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, y))
gap_start = None
if gap_start is not None:
gap_height = len(in_gap) - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, len(in_gap)))
if not raw_gaps:
return None, None
# Step 5: Compute median gap size and large-gap threshold
gap_sizes = [g[1] - g[0] for g in raw_gaps]
median_gap = float(np.median(gap_sizes))
large_gap_threshold = median_gap * GAP_MULTIPLIER
# Step 6: Find largest qualifying gap in header / footer zones
# A separator gap must have content on BOTH sides — edge-touching gaps
# (e.g. dewarp padding at bottom) are not valid separators.
EDGE_MARGIN = max(5, actual_h // 400)
header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))
header_y: Optional[int] = None
footer_y: Optional[int] = None
best_header_size = 0
for gs, ge in raw_gaps:
if gs <= EDGE_MARGIN:
continue # skip gaps touching the top edge
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
if gap_size > best_header_size:
best_header_size = gap_size
header_y = ge # bottom edge of gap
best_footer_size = 0
for gs, ge in raw_gaps:
if ge >= actual_h - EDGE_MARGIN:
continue # skip gaps touching the bottom edge
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
if gap_size > best_footer_size:
best_footer_size = gap_size
footer_y = gs # top edge of gap
if header_y is not None:
logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
if footer_y is not None:
logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")
return header_y, footer_y
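# Worked example (hypothetical): actual_h=3000 with median gap 12px gives
# large_gap_threshold=24px; a 60px gap centred at y=150 (inside the top 20%
# zone and clear of the edge) sets header_y to that gap's bottom edge.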
def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
min_density: float = 0.005) -> bool:
"""Check whether a horizontal strip contains meaningful ink.
Args:
inv: Inverted binarized image (white-on-black).
y_start: Top of the region (inclusive).
y_end: Bottom of the region (exclusive).
min_density: Fraction of white pixels required to count as content.
Returns:
True if the region contains text/graphics, False if empty margin.
"""
if y_start >= y_end:
return False
strip = inv[y_start:y_end, :]
density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
return density > min_density
def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
img_w: int, img_h: int,
inv: Optional[np.ndarray] = None) -> None:
"""Add header/footer/margin regions in-place.
Uses gap-based detection when *inv* is provided, otherwise falls back
to simple top_y/bottom_y bounds.
Region types depend on whether there is actual content (text/graphics):
- 'header' / 'footer' — region contains text (e.g. title, page number)
- 'margin_top' / 'margin_bottom' — region is empty page margin
"""
header_y: Optional[int] = None
footer_y: Optional[int] = None
if inv is not None:
header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)
# --- Top region ---
top_boundary = header_y if header_y is not None and header_y > 10 else (
top_y if top_y > 10 else None
)
if top_boundary is not None:
has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
rtype = 'header' if has_content else 'margin_top'
regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
f"(has_content={has_content})")
# --- Bottom region ---
bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
bottom_y if bottom_y < img_h - 10 else None
)
if bottom_boundary is not None:
has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
rtype = 'footer' if has_content else 'margin_bottom'
regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
height=img_h - bottom_boundary))
logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
f"height={img_h - bottom_boundary}px (has_content={has_content})")


@@ -0,0 +1,329 @@
"""
Row grid regularization for document layout analysis.
Provides word-center-based row boundary refinement to improve
gap-based row detection. Extracted from cv_layout_rows.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Dict, List
import numpy as np
from cv_vocab_types import RowGeometry
logger = logging.getLogger(__name__)
def _regularize_row_grid(
rows: List['RowGeometry'],
word_dicts: List[Dict],
left_x: int, right_x: int,
top_y: int,
content_w: int, content_h: int,
inv: np.ndarray,
) -> List['RowGeometry']:
"""Rebuild row boundaries from word center-lines with section-break awareness.
Instead of overlaying a rigid grid, this derives row positions bottom-up
from the words themselves:
Step A: Group all content words into line clusters by Y-proximity.
Tolerance = 40% of median gap-based row height.
Step B: For each cluster compute:
- center_y = median of (word_top + word_height/2) for all words
- letter_h = median of word heights (excluding outliers > 2× median)
Step B2: Merge clusters whose centers are closer than 30% of row height
(spurious splits from OCR jitter).
Step C: Compute pitches (distances between consecutive centers).
Detect section breaks where gap > 1.8× median pitch.
Step D: Split clusters into sections at the section breaks.
Step E: Within each section, place row boundaries at midpoints between
consecutive line centers:
- First row top = center - local_pitch/2
- Last row bottom = center + local_pitch/2
- Interior boundaries = (center_i + center_{i+1}) / 2
This ensures rows tile seamlessly without gaps or overlaps.
Step F: Re-assign words to the nearest grid row by vertical center distance.
Step G: Validate that >= 85% of words land in a grid row; otherwise
fall back to the original gap-based rows.
Step H: Merge with preserved header/footer rows and re-index.
Guard: Requires >= 5 content rows from gap-based detection to activate.
This prevents the regularizer from running on very small images (e.g.
box sub-sessions with only 3-6 rows) where the gap-based detection
is already accurate enough.
Header/footer rows from the gap-based detection are preserved.
"""
content_rows = [r for r in rows if r.row_type == 'content']
non_content = [r for r in rows if r.row_type != 'content']
if len(content_rows) < 5:
return rows
# --- Step A: Group ALL words into line clusters ---
# Collect words that belong to content rows (deduplicated)
content_words: List[Dict] = []
seen_keys: set = set()
for r in content_rows:
for w in r.words:
key = (w['left'], w['top'], w['width'], w['height'])
if key not in seen_keys:
seen_keys.add(key)
content_words.append(w)
if len(content_words) < 5:
return rows
# Compute median word height (excluding outliers like tall brackets/IPA)
word_heights = sorted(w['height'] for w in content_words)
median_wh = word_heights[len(word_heights) // 2]
# Compute median gap-based row height — this is the actual line height
# as detected by the horizontal projection. We use 40% of this as
# grouping tolerance. This is much more reliable than using word height
# alone, because words on the same line can have very different heights
# (e.g. lowercase vs uppercase, brackets, phonetic symbols).
gap_row_heights = sorted(r.height for r in content_rows)
median_row_h = gap_row_heights[len(gap_row_heights) // 2]
# Tolerance: 40% of row height. Words on the same line should have
# centers within this range. Even if a word's bbox is taller/shorter,
# its center should stay within half a row height of the line center.
y_tol = max(10, int(median_row_h * 0.4))
# Sort by center_y, then group by proximity
words_by_center = sorted(content_words,
key=lambda w: (w['top'] + w['height'] / 2, w['left']))
line_clusters: List[List[Dict]] = []
current_line: List[Dict] = [words_by_center[0]]
current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2
for w in words_by_center[1:]:
w_center = w['top'] + w['height'] / 2
if abs(w_center - current_center) <= y_tol:
current_line.append(w)
else:
current_line.sort(key=lambda w: w['left'])
line_clusters.append(current_line)
current_line = [w]
current_center = w_center
if current_line:
current_line.sort(key=lambda w: w['left'])
line_clusters.append(current_line)
if len(line_clusters) < 3:
return rows
# --- Step B: Compute center_y per cluster ---
# center_y = median of (word_top + word_height/2) across all words in cluster
# letter_h = median of word heights, but excluding outlier-height words
# (>2× median) so that tall brackets/IPA don't skew the height
cluster_info: List[Dict] = []
for cl_words in line_clusters:
centers = [w['top'] + w['height'] / 2 for w in cl_words]
# Filter outlier heights for letter_h computation
normal_heights = [w['height'] for w in cl_words
if w['height'] <= median_wh * 2.0]
if not normal_heights:
normal_heights = [w['height'] for w in cl_words]
center_y = float(np.median(centers))
letter_h = float(np.median(normal_heights))
cluster_info.append({
'center_y_rel': center_y, # relative to content ROI
'center_y_abs': center_y + top_y, # absolute
'letter_h': letter_h,
'words': cl_words,
})
cluster_info.sort(key=lambda c: c['center_y_rel'])
# --- Step B2: Merge clusters that are too close together ---
# Even with center-based grouping, some edge cases can produce
# spurious clusters. Merge any pair whose centers are closer
# than 30% of the row height (they're definitely the same text line).
merge_threshold = max(8, median_row_h * 0.3)
merged: List[Dict] = [cluster_info[0]]
for cl in cluster_info[1:]:
prev = merged[-1]
if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
# Merge: combine words, recompute center
combined_words = prev['words'] + cl['words']
centers = [w['top'] + w['height'] / 2 for w in combined_words]
normal_heights = [w['height'] for w in combined_words
if w['height'] <= median_wh * 2.0]
if not normal_heights:
normal_heights = [w['height'] for w in combined_words]
prev['center_y_rel'] = float(np.median(centers))
prev['center_y_abs'] = prev['center_y_rel'] + top_y
prev['letter_h'] = float(np.median(normal_heights))
prev['words'] = combined_words
else:
merged.append(cl)
cluster_info = merged
if len(cluster_info) < 3:
return rows
# --- Step C: Compute pitches and detect section breaks ---
pitches: List[float] = []
for i in range(1, len(cluster_info)):
pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
pitches.append(pitch)
if not pitches:
return rows
median_pitch = float(np.median(pitches))
if median_pitch <= 5:
return rows
# A section break is where the gap between line centers is much larger
# than the normal pitch (sub-headings, section titles, etc.)
BREAK_FACTOR = 1.8
# --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
sections: List[List[Dict]] = []
current_section: List[Dict] = [cluster_info[0]]
for i in range(1, len(cluster_info)):
gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
if gap > median_pitch * BREAK_FACTOR:
sections.append(current_section)
current_section = [cluster_info[i]]
else:
current_section.append(cluster_info[i])
if current_section:
sections.append(current_section)
# --- Step E: Build row boundaries per section ---
grid_rows: List[RowGeometry] = []
for section in sections:
if not section:
continue
if len(section) == 1:
# Single-line section (likely a heading)
cl = section[0]
half_h = max(cl['letter_h'], median_pitch * 0.4)
row_top = cl['center_y_abs'] - half_h
row_bot = cl['center_y_abs'] + half_h
grid_rows.append(RowGeometry(
index=0,
x=left_x,
y=round(row_top),
width=content_w,
height=round(row_bot - row_top),
word_count=len(cl['words']),
words=cl['words'],
row_type='content',
gap_before=0,
))
continue
# Compute local pitch for this section
local_pitches = []
for i in range(1, len(section)):
local_pitches.append(
section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
)
local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch
# Row boundaries are placed at midpoints between consecutive centers.
# First row: top = center - local_pitch/2
# Last row: bottom = center + local_pitch/2
for i, cl in enumerate(section):
if i == 0:
row_top = cl['center_y_abs'] - local_pitch / 2
else:
# Midpoint between this center and previous center
prev_center = section[i - 1]['center_y_abs']
row_top = (prev_center + cl['center_y_abs']) / 2
if i == len(section) - 1:
row_bot = cl['center_y_abs'] + local_pitch / 2
else:
next_center = section[i + 1]['center_y_abs']
row_bot = (cl['center_y_abs'] + next_center) / 2
# Clamp to reasonable bounds
row_top = max(top_y, row_top)
row_bot = min(top_y + content_h, row_bot)
if row_bot - row_top < 5:
continue
grid_rows.append(RowGeometry(
index=0,
x=left_x,
y=round(row_top),
width=content_w,
height=round(row_bot - row_top),
word_count=len(cl['words']),
words=cl['words'],
row_type='content',
gap_before=0,
))
if not grid_rows:
return rows
# --- Step F: Re-assign words to grid rows ---
# Words may have shifted slightly; assign each word to the row whose
# center is closest to the word's vertical center.
for gr in grid_rows:
gr.words = []
for w in content_words:
w_center = w['top'] + top_y + w['height'] / 2
best_row = None
best_dist = float('inf')
for gr in grid_rows:
row_center = gr.y + gr.height / 2
dist = abs(w_center - row_center)
if dist < best_dist:
best_dist = dist
best_row = gr
if best_row is not None and best_dist < median_pitch:
best_row.words.append(w)
for gr in grid_rows:
gr.word_count = len(gr.words)
# --- Step G: Validate ---
words_placed = sum(gr.word_count for gr in grid_rows)
if len(content_words) > 0:
match_ratio = words_placed / len(content_words)
if match_ratio < 0.85:
logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
f"of words, keeping gap-based rows")
return rows
# Remove empty grid rows (no words assigned)
grid_rows = [gr for gr in grid_rows if gr.word_count > 0]
# --- Step H: Merge header/footer + re-index ---
result = list(non_content) + grid_rows
result.sort(key=lambda r: r.y)
for i, r in enumerate(result):
r.index = i
row_heights = [gr.height for gr in grid_rows]
min_h = min(row_heights) if row_heights else 0
max_h = max(row_heights) if row_heights else 0
logger.info(f"RowGrid: word-center grid applied "
f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
f"{len(sections)} sections, "
f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
f"was {len(content_rows)} gap-based rows)")
return result


@@ -0,0 +1,352 @@
"""
Row geometry detection for document layout analysis.
Provides horizontal whitespace-gap analysis to detect text rows,
word-center grid regularization, and fallback word-grouping.
Extracted from cv_layout.py.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Dict, List
import numpy as np
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
from cv_vocab_types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid
logger = logging.getLogger(__name__)
# =============================================================================
# Row Geometry Detection (horizontal whitespace-gap analysis)
# =============================================================================
def detect_row_geometry(
inv: np.ndarray,
word_dicts: List[Dict],
left_x: int, right_x: int,
top_y: int, bottom_y: int,
) -> List['RowGeometry']:
"""Detect row geometry using horizontal whitespace-gap analysis.
Algorithm overview (two phases):
Phase 1 — Gap-based detection (Steps 1–6):
1. Build a horizontal projection profile: for each y-pixel, sum the
ink density across the content width. Only pixels within/near
Tesseract word bounding boxes contribute (word_mask), so that
images/illustrations don't merge adjacent text rows.
2. Smooth the projection and find contiguous regions below a
threshold (= gaps / horizontal whitespace between text lines).
The threshold is 15% of the median non-zero density.
3. Validate gaps against word bounding boxes — discard any gap
that overlaps a word, or shift the gap boundary to avoid the word.
4. Build rows from the spans between validated gaps.
5. Detect header/footer rows: gaps in the top/bottom 15% of the
page that are >= 2× the median gap size mark section boundaries.
Phase 2 — Word-center regularization (_regularize_row_grid, Step 7):
For each word, compute its vertical center (top + height/2).
Group words into line clusters by Y-proximity (tolerance = 40% of
the median gap-based row height).
For each cluster, the line center = median of all word centers.
The "pitch" = distance between consecutive line centers.
Section breaks are detected where the pitch exceeds 1.8× the median.
Within each section, row boundaries are placed at the midpoints
between consecutive line centers:
- Row top = midpoint to previous line center (or center - pitch/2 for first)
- Row bottom = midpoint to next line center (or center + pitch/2 for last)
This ensures rows tile without gaps or overlaps.
Fallback:
If < 2 gaps are found (very dense or uniform text), falls back to
_build_rows_from_word_grouping() which groups words by Y proximity.
Args:
inv: Inverted binarized image (white text on black bg, full page).
word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
left_x, right_x: Absolute X bounds of the content area.
top_y, bottom_y: Absolute Y bounds of the content area.
Returns:
List of RowGeometry objects sorted top to bottom.
"""
content_w = right_x - left_x
content_h = bottom_y - top_y
if content_h < 10 or content_w < 10:
logger.warning("detect_row_geometry: content area too small")
return []
# --- Step 1: Horizontal projection profile ---
# For each y-pixel row, sum ink density across the content width.
# A word-coverage mask ensures only pixels near Tesseract words contribute,
# so that illustrations/images don't inflate the density and merge rows.
content_strip = inv[top_y:bottom_y, left_x:right_x]
WORD_PAD_Y = max(4, content_h // 300) # small vertical padding around words
word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
for wd in word_dicts:
y1 = max(0, wd['top'] - WORD_PAD_Y)
y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
x1 = max(0, wd['left'])
x2 = min(content_w, wd['left'] + wd['width'])
word_mask[y1:y2, x1:x2] = 255
masked_strip = cv2.bitwise_and(content_strip, word_mask)
h_proj = np.sum(masked_strip, axis=1).astype(float)
h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
# --- Step 2: Smoothing + gap threshold ---
# Smooth the projection to reduce noise, then threshold at 15% of the
# median non-zero density. Pixels below this threshold are considered
# "gap" (horizontal whitespace between text lines).
# MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
kernel_size = max(3, content_h // 200)
if kernel_size % 2 == 0:
kernel_size += 1
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
gap_threshold = max(median_density * 0.15, 0.003)
in_gap = h_smooth < gap_threshold
MIN_GAP_HEIGHT = max(3, content_h // 500)
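# Example (hypothetical): content_h=3000 gives MIN_GAP_HEIGHT=6px; a median
# non-zero density of 0.06 gives gap_threshold = max(0.009, 0.003) = 0.009.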
# --- Step 3: Collect contiguous gap regions ---
raw_gaps = [] # (start_y_rel, end_y_rel) relative to content ROI
gap_start = None
for y in range(len(in_gap)):
if in_gap[y]:
if gap_start is None:
gap_start = y
else:
if gap_start is not None:
gap_height = y - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, y))
gap_start = None
if gap_start is not None:
gap_height = len(in_gap) - gap_start
if gap_height >= MIN_GAP_HEIGHT:
raw_gaps.append((gap_start, len(in_gap)))
logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
f"min_height={MIN_GAP_HEIGHT}px)")
# --- Step 4: Validate gaps against word bounding boxes ---
# A gap is valid only if no word's bounding box overlaps it vertically.
# If a word overlaps, try to shift the gap boundary above or below the
# word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
validated_gaps = []
for gap_start_rel, gap_end_rel in raw_gaps:
overlapping = False
for wd in word_dicts:
word_top = wd['top']
word_bottom = wd['top'] + wd['height']
if word_top < gap_end_rel and word_bottom > gap_start_rel:
overlapping = True
break
if not overlapping:
validated_gaps.append((gap_start_rel, gap_end_rel))
else:
# Try to shift the gap to avoid overlapping words
min_word_top = content_h
max_word_bottom = 0
for wd in word_dicts:
word_top = wd['top']
word_bottom = wd['top'] + wd['height']
if word_top < gap_end_rel and word_bottom > gap_start_rel:
min_word_top = min(min_word_top, word_top)
max_word_bottom = max(max_word_bottom, word_bottom)
if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
validated_gaps.append((gap_start_rel, min_word_top))
elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
validated_gaps.append((max_word_bottom, gap_end_rel))
else:
logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
f"discarded (word overlap, no room to shift)")
logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")
# --- Fallback if too few gaps ---
if len(validated_gaps) < 2:
logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
return _build_rows_from_word_grouping(
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
)
validated_gaps.sort(key=lambda g: g[0])
# --- Step 5: Header/footer detection via gap size ---
HEADER_FOOTER_ZONE = 0.15
GAP_MULTIPLIER = 2.0
gap_sizes = [g[1] - g[0] for g in validated_gaps]
median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
large_gap_threshold = median_gap * GAP_MULTIPLIER
header_boundary_rel = None # y below which is header
footer_boundary_rel = None # y above which is footer
header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))
# Find largest gap in header zone
best_header_gap = None
for gs, ge in validated_gaps:
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
best_header_gap = (gs, ge)
if best_header_gap is not None:
header_boundary_rel = best_header_gap[1]
logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
f"median_gap={median_gap:.0f}px)")
# Find largest gap in footer zone
best_footer_gap = None
for gs, ge in validated_gaps:
gap_mid = (gs + ge) / 2
gap_size = ge - gs
if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
best_footer_gap = (gs, ge)
if best_footer_gap is not None:
footer_boundary_rel = best_footer_gap[0]
logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")
# --- Step 6: Build RowGeometry objects from gaps ---
# Rows are the spans between consecutive gaps. The gap midpoints define
# where one row ends and the next begins. Each row's height extends
# from the end of the previous gap to the start of the next gap.
row_boundaries = [] # (start_y_rel, end_y_rel)
# Top of content to first gap
if validated_gaps[0][0] > MIN_GAP_HEIGHT:
row_boundaries.append((0, validated_gaps[0][0]))
# Between gaps
for i in range(len(validated_gaps) - 1):
row_start = validated_gaps[i][1]
row_end = validated_gaps[i + 1][0]
if row_end - row_start > 0:
row_boundaries.append((row_start, row_end))
# Last gap to bottom of content
if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
row_boundaries.append((validated_gaps[-1][1], content_h))
rows = []
for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
# Determine row type
row_mid = (row_start_rel + row_end_rel) / 2
if header_boundary_rel is not None and row_mid < header_boundary_rel:
row_type = 'header'
elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
row_type = 'footer'
else:
row_type = 'content'
# Collect words in this row
row_words = [w for w in word_dicts
if w['top'] + w['height'] / 2 >= row_start_rel
and w['top'] + w['height'] / 2 < row_end_rel]
# Gap before this row
gap_before = 0
if idx == 0 and validated_gaps[0][0] > 0:
gap_before = validated_gaps[0][0]
elif idx > 0:
# Find the gap just before this row boundary
for gs, ge in validated_gaps:
if ge == row_start_rel:
gap_before = ge - gs
break
rows.append(RowGeometry(
index=idx,
x=left_x,
y=top_y + row_start_rel,
width=content_w,
height=row_end_rel - row_start_rel,
word_count=len(row_words),
words=row_words,
row_type=row_type,
gap_before=gap_before,
))
# --- Step 7: Word-center grid regularization ---
# Refine the gap-based rows using word vertical centers. For each word,
# compute center_y = top + height/2. Group into line clusters, compute
# the pitch (distance between consecutive line centers), and place row
# boundaries at the midpoints between centers. This gives more precise
# and evenly-spaced rows than the gap-based approach alone.
# Also detects section breaks (headings, paragraphs) where the pitch
# exceeds 1.8× the median, and handles each section independently.
rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
content_w, content_h, inv)
type_counts = {}
for r in rows:
type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")
return rows
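# Minimal usage sketch (bounds hypothetical; `inv` is the inverted binarized
# page, word_dicts comes from Tesseract and is relative to the content ROI):
#   rows = detect_row_geometry(inv, word_dicts, left_x=120, right_x=2300,
#                              top_y=200, bottom_y=3200)
#   content_rows = [r for r in rows if r.row_type == 'content']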
def _build_rows_from_word_grouping(
word_dicts: List[Dict],
left_x: int, right_x: int,
top_y: int, bottom_y: int,
content_w: int, content_h: int,
) -> List['RowGeometry']:
"""Fallback: build rows by grouping words by Y position.
Uses _group_words_into_lines() with a generous tolerance.
No header/footer detection in fallback mode.
"""
if not word_dicts:
return []
y_tolerance = max(20, content_h // 100)
lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
rows = []
for idx, line_words in enumerate(lines):
if not line_words:
continue
min_top = min(w['top'] for w in line_words)
max_bottom = max(w['top'] + w['height'] for w in line_words)
row_height = max_bottom - min_top
rows.append(RowGeometry(
index=idx,
x=left_x,
y=top_y + min_top,
width=content_w,
height=row_height,
word_count=len(line_words),
words=line_words,
row_type='content',
gap_before=0,
))
logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
return rows


@@ -0,0 +1,441 @@
"""
Language scoring, role scoring, and dictionary detection/classification.
Extracted from cv_layout.py to keep modules under 500 LOC.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from collections import Counter
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
ColumnGeometry,
ENGLISH_FUNCTION_WORDS,
GERMAN_FUNCTION_WORDS,
PageRegion,
)
logger = logging.getLogger(__name__)
# --- Dictionary / Wörterbuch Detection ---
# Article words that appear as a dedicated column in dictionaries
_DICT_ARTICLE_WORDS = {
# German articles
"die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
# English articles / infinitive marker
"the", "a", "an", "to",
}
# --- Phase B: Content-Based Classification ---
def _score_language(words: List[Dict]) -> Dict[str, float]:
"""Score the language of a column's words.
Analyzes function words, umlauts, and capitalization patterns
to determine whether text is English or German.
Args:
words: List of word dicts with 'text' and 'conf' keys.
Returns:
Dict with 'eng' and 'deu' scores (0.0-1.0).
"""
if not words:
return {'eng': 0.0, 'deu': 0.0}
# Only consider words with decent confidence
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
if not good_words:
return {'eng': 0.0, 'deu': 0.0}
total = len(good_words)
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
# Check for umlauts (strong German signal)
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
umlaut_count = sum(1 for t in raw_texts
for c in t if c in 'äöüÄÖÜß')
# German capitalization: nouns are capitalized mid-sentence
# Count words that start with uppercase but aren't at position 0
cap_words = sum(1 for t in raw_texts if len(t) > 2 and t[0].isupper())
en_score = en_hits / total if total > 0 else 0.0
de_score = de_hits / total if total > 0 else 0.0
# Boost German score for umlauts
if umlaut_count > 0:
de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))
# Boost German score for high capitalization ratio (typical for German nouns)
if total > 5:
cap_ratio = cap_words / total
if cap_ratio > 0.3:
de_score = min(1.0, de_score + 0.1)
return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
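# Illustrative call (hypothetical words): 'the', 'Haus', 'Bäume' at conf > 40
# yield eng = 1/3 ('the' is an English function word) and deu = 0.15 (the
# umlaut boost alone, assuming neither noun is in GERMAN_FUNCTION_WORDS).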
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
"""Score the role of a column based on its geometry and content patterns.
Args:
geom: ColumnGeometry with words and dimensions.
Returns:
Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
"""
scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}
if not geom.words:
return scores
texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
if not texts:
return scores
avg_word_len = sum(len(t) for t in texts) / len(texts)
has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
digit_ratio = digit_words / len(texts) if texts else 0.0
# Reference: narrow + mostly numbers/page references
if geom.width_ratio < 0.12:
scores['reference'] = 0.5
if digit_ratio > 0.4:
scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)
# Marker: narrow + few short entries
if geom.width_ratio < 0.06 and geom.word_count <= 15:
scores['marker'] = 0.7
if avg_word_len < 4:
scores['marker'] = 0.9
# Very narrow non-edge column → strong marker regardless of word count
if geom.width_ratio < 0.04 and geom.index > 0:
scores['marker'] = max(scores['marker'], 0.9)
# Sentence: longer words + punctuation present
if geom.width_ratio > 0.15 and has_punctuation > 2:
scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
if avg_word_len > 4:
scores['sentence'] = min(1.0, scores['sentence'] + 0.2)
# Vocabulary: medium width + medium word length
if 0.10 < geom.width_ratio < 0.45:
scores['vocabulary'] = 0.4
if 3 < avg_word_len < 8:
scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)
return {k: round(v, 3) for k, v in scores.items()}
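# Illustrative scoring (hypothetical): a column with width_ratio=0.05,
# 10 words and an average word length of 3 scores marker=0.9; a
# width_ratio=0.30 column with several punctuated words leans 'sentence'.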
def _score_dictionary_signals(
geometries: List[ColumnGeometry],
document_category: Optional[str] = None,
margin_strip_detected: bool = False,
) -> Dict[str, Any]:
"""Score dictionary-specific patterns across all columns.
Combines 4 independent signals to determine if the page is a dictionary:
1. Alphabetical ordering of words in each column
2. Article column detection (der/die/das, to)
3. First-letter uniformity (most headwords share a letter)
4. Decorative A-Z margin strip (detected upstream)
Args:
geometries: List of ColumnGeometry with words.
document_category: User-selected category (e.g. 'woerterbuch').
margin_strip_detected: Whether a decorative A-Z margin strip was found.
Returns:
Dict with 'is_dictionary', 'confidence', 'article_col_index',
'headword_col_index', and 'signals' sub-dict.
"""
result: Dict[str, Any] = {
"is_dictionary": False,
"confidence": 0.0,
"article_col_index": None,
"headword_col_index": None,
"signals": {},
}
if not geometries or len(geometries) < 2:
return result
# --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
best_alpha_score = 0.0
best_alpha_col = -1
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in sorted(geom.words, key=lambda w: w.get("top", 0))
if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
]
if len(texts) < 5:
continue
# Deduplicate consecutive identical words (OCR double-reads)
deduped = [texts[0]]
for t in texts[1:]:
if t != deduped[-1]:
deduped.append(t)
if len(deduped) < 5:
continue
# Count consecutive pairs in alphabetical order
ordered_pairs = sum(
1 for i in range(len(deduped) - 1)
if deduped[i] <= deduped[i + 1]
)
alpha_score = ordered_pairs / (len(deduped) - 1)
if alpha_score > best_alpha_score:
best_alpha_score = alpha_score
best_alpha_col = geom.index
result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
result["signals"]["alphabetical_col"] = best_alpha_col
# --- Signal 2: Article detection (weight 0.25) ---
# Check three patterns:
# (a) Dedicated narrow article column (der/die/das only)
# (b) Inline articles: multi-word texts starting with "der X", "die X"
# (c) High article word frequency: many individual words ARE articles
# (common when OCR splits "der Zustand" into separate word_boxes)
best_article_density = 0.0
best_article_col = -1
best_inline_article_ratio = 0.0
best_article_word_ratio = 0.0
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in geom.words
if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0
]
if len(texts) < 3:
continue
# (a) Dedicated article column: narrow, mostly article words
article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
if geom.width_ratio <= 0.20:
density = article_count / len(texts)
if density > best_article_density:
best_article_density = density
best_article_col = geom.index
# (b) Inline articles: "der Zustand", "die Zutat", etc.
inline_count = sum(
1 for t in texts
if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS)
)
inline_ratio = inline_count / len(texts)
if inline_ratio > best_inline_article_ratio:
best_inline_article_ratio = inline_ratio
# (c) Article word frequency in any column (for OCR-split word_boxes)
# In dictionaries, articles appear frequently among headwords
# Require at least 10% articles and >= 3 article words
if article_count >= 3:
art_ratio = article_count / len(texts)
# Only count if column has enough non-article words too
# (pure article column is handled by (a))
non_art = len(texts) - article_count
if non_art >= 3 and art_ratio > best_article_word_ratio:
best_article_word_ratio = art_ratio
# Use the strongest signal
effective_article_score = max(
best_article_density,
best_inline_article_ratio,
best_article_word_ratio * 0.8, # slight discount for raw word ratio
)
result["signals"]["article_density"] = round(best_article_density, 3)
result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
result["signals"]["article_col"] = best_article_col
# --- Signal 3: First-letter uniformity (weight 0.25) ---
best_uniformity = 0.0
best_uniform_col = -1
has_letter_transition = False
for geom in geometries:
texts = [
w["text"].strip().lower()
for w in sorted(geom.words, key=lambda w: w.get("top", 0))
if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
]
if len(texts) < 5:
continue
# Count first letters
first_letters = [t[0] for t in texts if t[0].isalpha()]
if not first_letters:
continue
letter_counts = Counter(first_letters)
most_common_letter, most_common_count = letter_counts.most_common(1)[0]
uniformity = most_common_count / len(first_letters)
# Check for orderly letter transitions (A→B or Y→Z)
# Group consecutive words by first letter, check if groups are in order
groups = []
current_letter = first_letters[0]
for fl in first_letters:
if fl != current_letter:
groups.append(current_letter)
current_letter = fl
groups.append(current_letter)
if 2 <= len(groups) <= 5:
# Check if groups are alphabetically ordered
if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)):
has_letter_transition = True
# Boost uniformity for orderly transitions
uniformity = max(uniformity, 0.70)
if uniformity > best_uniformity:
best_uniformity = uniformity
best_uniform_col = geom.index
result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
result["signals"]["uniform_col"] = best_uniform_col
result["signals"]["has_letter_transition"] = has_letter_transition
# --- Signal 4: Decorative margin strip (weight 0.15) ---
result["signals"]["margin_strip_detected"] = margin_strip_detected
# --- Combine signals ---
s1 = min(best_alpha_score, 1.0) * 0.35
s2 = min(effective_article_score, 1.0) * 0.25
s3 = min(best_uniformity, 1.0) * 0.25
s4 = (1.0 if margin_strip_detected else 0.0) * 0.15
combined = s1 + s2 + s3 + s4
# Boost if user set document_category to 'woerterbuch'
if document_category == "woerterbuch":
combined = min(1.0, combined + 0.20)
result["signals"]["category_boost"] = True
result["confidence"] = round(combined, 3)
# Threshold: combined >= 0.40 to classify as dictionary
# (at least 2 strong signals or 3 moderate ones)
if combined >= 0.40:
result["is_dictionary"] = True
# Identify headword column: best alphabetical OR best uniform
if best_alpha_col >= 0 and best_alpha_score >= 0.60:
result["headword_col_index"] = best_alpha_col
elif best_uniform_col >= 0 and best_uniformity >= 0.50:
result["headword_col_index"] = best_uniform_col
if best_article_col >= 0 and best_article_density >= 0.30:
result["article_col_index"] = best_article_col
# If inline articles are strong but no dedicated column, note it
if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
result["signals"]["inline_articles_detected"] = True
logger.info(
"DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
combined, result["is_dictionary"], result["signals"],
)
return result
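# Worked example for the signal combination above (hypothetical scores):
# alpha=0.8, article=0.5, uniformity=0.6, margin strip detected:
# combined = 0.8*0.35 + 0.5*0.25 + 0.6*0.25 + 1.0*0.15
#          = 0.28 + 0.125 + 0.15 + 0.15 = 0.705 >= 0.40 -> dictionary page.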
def _classify_dictionary_columns(
geometries: List[ColumnGeometry],
dict_signals: Dict[str, Any],
lang_scores: List[Dict[str, float]],
content_h: int,
) -> Optional[List[PageRegion]]:
"""Classify columns for a detected dictionary page.
Assigns column_headword, column_article, column_ipa, and
column_de/column_en based on dictionary signals and language scores.
Returns None if classification fails.
"""
if not dict_signals.get("is_dictionary"):
return None
regions: List[PageRegion] = []
assigned = set()
article_idx = dict_signals.get("article_col_index")
headword_idx = dict_signals.get("headword_col_index")
# 1. Assign article column if detected
if article_idx is not None:
for geom in geometries:
if geom.index == article_idx:
regions.append(PageRegion(
type="column_article",
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(
dict_signals["signals"].get("article_density", 0.5), 2),
classification_method="dictionary",
))
assigned.add(geom.index)
break
# 2. Assign headword column
if headword_idx is not None and headword_idx not in assigned:
for geom in geometries:
if geom.index == headword_idx:
regions.append(PageRegion(
type="column_headword",
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=round(
dict_signals["confidence"], 2),
classification_method="dictionary",
))
assigned.add(geom.index)
break
# 3. Assign remaining columns by language + content
remaining = [g for g in geometries if g.index not in assigned]
for geom in remaining:
ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}
# Check if column contains IPA (brackets like [, /, ˈ)
ipa_chars = sum(
1 for w in geom.words
if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
)
ipa_ratio = ipa_chars / max(len(geom.words), 1)
if ipa_ratio > 0.25:
col_type = "column_ipa"
conf = round(min(1.0, ipa_ratio), 2)
elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
col_type = "column_de"
conf = round(ls["deu"], 2)
elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
col_type = "column_en"
conf = round(ls["eng"], 2)
else:
# Positional fallback: leftmost unassigned = EN, next = DE
left_unassigned = sorted(
[g for g in remaining if g.index not in assigned],
key=lambda g: g.x,
)
if left_unassigned and geom is left_unassigned[0]:
col_type = "column_en"
else:
col_type = "column_de"
conf = 0.4
regions.append(PageRegion(
type=col_type,
x=geom.x, y=geom.y,
width=geom.width, height=content_h,
classification_confidence=conf,
classification_method="dictionary",
))
assigned.add(geom.index)
regions.sort(key=lambda r: r.x)
return regions


@@ -0,0 +1,493 @@
"""
Cell text filtering, column/row word assignment, and bold detection.
This module contains:
- _assign_row_words_to_columns(): spatial assignment of OCR words to grid columns
- Cell text noise filtering (_clean_cell_text, _clean_cell_text_lite, etc.)
- Bold detection via stroke-width analysis (_measure_stroke_width, _classify_bold_cells)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import re
import logging
from typing import Any, Dict, List, Optional
import numpy as np
from cv_vocab_types import PageRegion, RowGeometry
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Column / Row word assignment
# ---------------------------------------------------------------------------
def _assign_row_words_to_columns(
row: RowGeometry,
columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
"""Assign each word in a row to exactly one column.
Uses a two-pass strategy:
1. Containment: if a word's center falls within a column's horizontal
bounds (with padding), assign it to that column.
2. Nearest center: for words not contained by any column, fall back to
nearest column center distance.
This prevents long sentences in wide columns (e.g. example) from having
their rightmost words stolen by an adjacent column.
Args:
row: Row with words (relative coordinates).
columns: Sorted list of columns (absolute coordinates).
Returns:
Dict mapping col_index -> list of words assigned to that column.
"""
result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}
if not row.words or not columns:
return result
left_x = row.x # content ROI left (absolute)
# Build non-overlapping column assignment ranges using midpoints.
# For adjacent columns, the boundary is the midpoint between them.
# This prevents words near column borders from being assigned to
# the wrong column (e.g. "We" at the start of an example sentence
# being stolen by the preceding DE column).
n = len(columns)
col_ranges_rel = [] # (assign_left, assign_right) per column
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
# Left boundary: midpoint to previous column, or 0
if ci == 0:
assign_left = 0
else:
prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
assign_left = (prev_right + col_left_rel) / 2
# Right boundary: midpoint to next column, or infinity (row width)
if ci == n - 1:
assign_right = row.width + 100 # generous for last column
else:
next_left = columns[ci + 1].x - left_x
assign_right = (col_right_rel + next_left) / 2
col_ranges_rel.append((assign_left, assign_right))
for w in row.words:
w_left = w['left']
w_right = w_left + w['width']
w_center_x = w_left + w['width'] / 2
# Primary: overlap-based matching — assign to column with most overlap.
# This is more robust than center-based for narrow columns (page_ref)
# where the last character's center may fall into the next column.
best_col = -1
best_overlap = 0
for ci, col in enumerate(columns):
col_left_rel = col.x - left_x
col_right_rel = col_left_rel + col.width
overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
if overlap > best_overlap:
best_overlap = overlap
best_col = ci
if best_col >= 0 and best_overlap > 0:
result[best_col].append(w)
else:
# Fallback: center-based range matching
assigned = False
for ci, (al, ar) in enumerate(col_ranges_rel):
if al <= w_center_x < ar:
result[ci].append(w)
assigned = True
break
if not assigned:
# Last resort: nearest column center
best_col = 0
col_left_0 = columns[0].x - left_x
best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
for ci in range(1, n):
col_left = columns[ci].x - left_x
dist = abs(w_center_x - (col_left + columns[ci].width / 2))
if dist < best_dist:
best_dist = dist
best_col = ci
result[best_col].append(w)
return result
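# Example (hypothetical geometry): two columns at absolute x=100 (w=200) and
# x=340 (w=260) in a row starting at x=100 give relative spans col0=[0..200]
# and col1=[240..500]; the midpoint boundary is (200 + 240) / 2 = 220. A word
# at rel. x=[225..235] sits in the gutter and overlaps neither column, so the
# centre-range fallback assigns it to col1 (220 <= 230 < 600).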
# ---------------------------------------------------------------------------
# Cell text noise filtering
# ---------------------------------------------------------------------------
# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')
# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
_COMMON_SHORT_WORDS: set = {
# EN 1-2 letter
'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
'or', 'so', 'to', 'up', 'us', 'we',
# EN 3 letter
'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
'zap', 'zip', 'zoo',
# DE 2-3 letter
'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
'wut', 'zum', 'zur',
}
# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
# EN dictionary meta-words
'sth', 'sb', 'smth', 'smb', 'sbd',
# EN general
'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
# EN references / textbook
'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
'ans', 'wb', 'tb', 'vocab',
# EN parts of speech / grammar
'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
'syn', 'ant', 'opp', 'var', 'orig',
# EN titles
'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
# EN pronunciation
'br', 'am', 'brit', 'amer',
# EN units
'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
# DE general
'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
'bes', 'insb', 'insbes', 'bspw', 'ca',
'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
'inkl', 'exkl', 'zzgl', 'abzgl',
# DE references
'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
's', 'sp', 'zit', 'zs', 'vlg',
# DE grammar
'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
'trennb', 'untrennb', 'ugs', 'geh', 'pej',
# DE regional
'nordd', 'österr', 'schweiz',
# Linguistic
'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}
def _is_noise_tail_token(token: str) -> bool:
"""Check if a token at the END of cell text is trailing OCR noise.
Trailing fragments are very common OCR artifacts from image edges,
borders, and neighbouring cells. This is more aggressive than a
general word filter: any short token that isn't in the dictionary
of common EN/DE words is considered noise.
Examples of noise: "Es)", "3", "ee", "B"
Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
"""
t = token.strip()
if not t:
return True
# Keep ellipsis
if t in ('...', '…'):
return False
# Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
if t.startswith('[') or t.startswith('["') or t.startswith("['"):
return False
if t.endswith(']'):
return False
# Keep meaningful punctuation tokens used in textbooks
# = (definition marker), (= (definition opener), ; (separator)
if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
return False
# Pure non-alpha -> noise ("3", ")", "|")
alpha_chars = _RE_ALPHA.findall(t)
if not alpha_chars:
return True
# Extract only alpha characters for dictionary lookup
cleaned = ''.join(alpha_chars)
# Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
if cleaned.lower() in _KNOWN_ABBREVIATIONS:
return False
# Strip normal trailing punctuation before checking for internal noise.
stripped_punct = re.sub(r'[.,;:!?]+$', '', t) # "cupcakes." -> "cupcakes"
t_check = stripped_punct if stripped_punct else t
# Check for legitimate punctuation patterns vs. real noise.
# Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
# "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
# Noise: "3d", "B|", "x7"
# Strategy: strip common dictionary punctuation (parens, hyphens, slashes),
# THEN check if residual contains only alpha characters.
t_inner = t_check
# Remove all parentheses, hyphens, slashes, and dots — these are normal
# in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)",
# "(zer)brechen", "wir/uns", "e.g."
t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner)
# Now check: does the inner form still have non-alpha noise?
inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False
# Long alpha words (4+ chars) without internal noise are likely real
if len(cleaned) >= 4 and not has_internal_noise:
return False
# Short words: check dictionary (uses only alpha chars)
if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
return False
# Default: short or suspicious -> noise
return True
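# Illustrative classifications (given the word sets above):
#   _is_noise_tail_token("3")         -> True   (no alphabetic characters)
#   _is_noise_tail_token("ee")        -> True   (short, not a known word)
#   _is_noise_tail_token("sth.")      -> False  (known abbreviation)
#   _is_noise_tail_token("cupcakes.") -> False  (clean alpha word >= 4 chars)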
def _is_garbage_text(text: str) -> bool:
"""Check if entire cell text is OCR garbage from image areas.
Garbage text = no recognizable dictionary word. Catches
"(ci]oeu", "uanoaain." etc.
"""
words = _RE_REAL_WORD.findall(text)
if not words:
# Check if any token is a known abbreviation (e.g. "e.g.")
alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
if alpha_only in _KNOWN_ABBREVIATIONS:
return False
return True
for w in words:
wl = w.lower()
# Known short word or abbreviation -> not garbage
if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS:
return False
# Long word (>= 4 chars): check vowel/consonant ratio.
# Real EN/DE words have 20-60% vowels. Garbage like "uanoaain"
# or "cioeu" has unusual ratios (too many or too few vowels).
if len(wl) >= 4:
vowels = sum(1 for c in wl if c in 'aeiouäöü')
ratio = vowels / len(wl)
if 0.15 <= ratio <= 0.65:
return False # plausible vowel ratio -> real word
return True
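# Example: "uanoaain" has 6 vowels in 8 letters (ratio 0.75 > 0.65), so the
# plausibility check fails and the text is flagged as garbage; "dance" has
# a vowel ratio of 2/5 = 0.40, which is plausible, so it is kept.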
def _clean_cell_text(text: str) -> str:
"""Remove OCR noise from cell text. Generic filters:
1. If the entire text has no real alphabetic word (>= 2 letters), clear.
2. If the entire text is garbage (no dictionary word), clear.
3. Strip trailing noise tokens from the end of the text.
"""
stripped = text.strip()
if not stripped:
return ''
# --- Filter 1: No real word at all ---
if not _RE_REAL_WORD.search(stripped):
# Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
if alpha_only not in _KNOWN_ABBREVIATIONS:
return ''
# --- Filter 2: Entire text is garbage ---
if _is_garbage_text(stripped):
return ''
# --- Filter 3: Strip trailing noise tokens ---
tokens = stripped.split()
while tokens and _is_noise_tail_token(tokens[-1]):
tokens.pop()
if not tokens:
return ''
return ' '.join(tokens)
def _clean_cell_text_lite(text: str) -> str:
"""Simplified noise filter for cell-first OCR (isolated cell crops).
Since each cell is OCR'd in isolation (no neighbour content visible),
trailing-noise stripping is unnecessary. Only 2 filters remain:
1. No real alphabetic word (>= 2 letters) and not a known abbreviation -> empty.
2. Entire text is garbage (no dictionary word) -> empty.
"""
stripped = text.strip()
if not stripped:
return ''
# --- Filter 1: No real word at all ---
if not _RE_REAL_WORD.search(stripped):
alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
if alpha_only not in _KNOWN_ABBREVIATIONS:
return ''
# --- Filter 2: Entire text is garbage ---
if _is_garbage_text(stripped):
return ''
return stripped
# ---------------------------------------------------------------------------
# Bold detection via stroke-width analysis (relative / page-level)
# ---------------------------------------------------------------------------
def _measure_stroke_width(gray_crop: np.ndarray) -> float:
"""Measure mean stroke width in a binarised cell crop.
Returns a DPI-normalised value (mean stroke width as % of crop height),
or 0.0 if measurement is not possible.
"""
if gray_crop is None or gray_crop.size == 0:
return 0.0
h, w = gray_crop.shape[:2]
if h < 10 or w < 10:
return 0.0
# Binarise: text = white (255), background = black (0)
_, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
if cv2.countNonZero(bw) < 20:
return 0.0
# Distance transform: value at each white pixel = distance to nearest black
dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)
# Skeleton via morphological thinning
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
thin = bw.copy()
for _ in range(max(1, min(h, w) // 6)):
eroded = cv2.erode(thin, kernel)
if cv2.countNonZero(eroded) < 5:
break
thin = eroded
skeleton_pts = thin > 0
if not np.any(skeleton_pts):
return 0.0
mean_stroke = float(np.mean(dist[skeleton_pts]))
return mean_stroke / max(h, 1) * 100 # normalised: % of cell height
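# Example: a mean skeleton distance of 3 px in a 40 px tall crop yields a
# normalised stroke width of 3 / 40 * 100 = 7.5 (% of cell height), which
# stays comparable across cells regardless of scan DPI.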
def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
img_w: int, img_h: int) -> None:
"""Two-pass bold detection: measure all cells, then compare against median.
Cells with stroke width > 1.4x the page median are marked as bold.
This adapts automatically to font, DPI and scan quality.
Modifies cells in-place (sets 'is_bold' key).
"""
if ocr_img is None:
return
# Pass 1: measure stroke width for every cell with text
metrics: List[float] = []
cell_strokes: List[float] = []
for cell in cells:
sw = 0.0
if cell.get('text', '').strip():
bp = cell['bbox_px']
y1 = max(0, bp['y'])
y2 = min(img_h, bp['y'] + bp['h'])
x1 = max(0, bp['x'])
x2 = min(img_w, bp['x'] + bp['w'])
if y2 > y1 and x2 > x1:
sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
cell_strokes.append(sw)
if sw > 0:
metrics.append(sw)
if len(metrics) < 3:
# Too few cells to compare — leave all as non-bold
return
median_sw = float(np.median(metrics))
if median_sw <= 0:
return
# Pass 2: cells significantly above median -> bold
for cell, sw in zip(cells, cell_strokes):
cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
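# Example: with a page median stroke width of 7.5, a cell measuring 11.0
# has ratio 11.0 / 7.5 ≈ 1.47 > 1.4 and is marked bold, while a cell at
# 9.0 (ratio 1.2) stays regular.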


@@ -0,0 +1,189 @@
"""Cell-level IPA phonetic fixes for overlay mode.
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
"""
import logging
import re
from typing import Any, Dict, List
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_insert_missing_ipa,
_replace_phonetics_in_text,
_text_has_garbled_ipa,
)
from cv_ocr_ipa_repair import (
_has_non_dict_trailing,
_insert_headword_ipa,
_strip_post_bracket_garbled,
)
logger = logging.getLogger(__name__)
def fix_cell_phonetics(
cells: List[Dict[str, Any]],
pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
"""Apply IPA phonetic fixes to cell texts for overlay mode.
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
(entry['english']). But the overlay reads cell['text'] directly, so
phonetic fixes must be applied to cells too.
Processing depends on column type:
- column_en: Full processing (replace garbled IPA + strip orphan brackets
+ insert missing IPA). Safe because these cells contain only English
headwords.
- column_text: Light processing (replace garbled IPA ONLY). No orphan
bracket stripping (brackets may be German content like "(probieren)")
and no IPA insertion (would add tokens and break overlay positioning).
"""
if not IPA_AVAILABLE:
return cells
ipa_col_types = {'column_en', 'column_text'}
replaced = 0
for cell in cells:
col_type = cell.get('col_type', '')
if col_type not in ipa_col_types:
continue
text = cell.get('text', '') or ''
if not text.strip():
continue
if col_type == 'column_en':
# Full processing: replace garbled IPA, strip orphan brackets.
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
if new_text == text:
# Insert IPA when garbled phonetics exist OR when trailing
# non-dictionary words suggest garbled IPA in plain ASCII.
if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
new_text = _insert_missing_ipa(text, pronunciation)
# Strip trailing garbled fragments after proper [IPA] brackets
# (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
if ']' in new_text:
new_text = _strip_post_bracket_garbled(new_text, pronunciation)
else:
# column_text: replace garbled IPA, no orphan stripping
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
# Insert headword IPA ONLY if there's a gap in word_boxes
# suggesting Tesseract missed an IPA bracket on the page.
# Without gap evidence, the original page had no IPA.
if new_text == text:
wb = cell.get('word_boxes', [])
if _has_ipa_gap(text, wb):
inserted = _insert_headword_ipa(text, pronunciation)
if inserted != text:
new_text = inserted
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
if new_text != text:
logger.debug(f"fix_cell_phonetics: '{text}''{new_text}'")
cell['text'] = new_text
replaced += 1
if replaced:
logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells")
return cells
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
"""Check if word_boxes show a gap where IPA brackets should be.
On a typical vocab page, the layout is:
headword [ipa] German translation
If Tesseract missed the IPA bracket, the gap between the headword
and the next word (German translation) is unusually large (>80px)
because the IPA occupied physical space on the page.
If no IPA was on the page (e.g. "be good at sth."), the words are
close together (<30px).
"""
if not word_boxes or len(word_boxes) < 2:
return False
tokens = text.split()
if not tokens:
return False
# Find the headword index: skip numeric prefixes like "».55", "0.56"
hw_box_idx = 0
for i, wb in enumerate(word_boxes):
wt = wb.get('text', '')
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
if len(clean) >= 2:
hw_box_idx = i
break
if hw_box_idx >= len(word_boxes) - 1:
return False
# Check gap between headword and the next word_box
hw = word_boxes[hw_box_idx]
next_wb = word_boxes[hw_box_idx + 1]
gap = next_wb['left'] - (hw['left'] + hw['width'])
return gap > 80
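# Example (hypothetical boxes): a headword box at left=120, width=90 ends at
# x=210; if the next box starts at left=320, the gap is 110 px > 80, so a
# missed IPA bracket is assumed. With the next box at left=235 (gap 25 px),
# no gap is reported.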
def _sync_word_boxes_after_ipa_insert(
cell: Dict[str, Any],
old_text: str,
new_text: str,
) -> None:
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
E.g. "challenge ...""challenge [tʃælɪndʒ] ..."
Adds a new word_box right after the headword's box so the 1:1
token-to-box mapping in the frontend overlay stays consistent.
"""
word_boxes = cell.get('word_boxes')
if not word_boxes:
return
old_tokens = old_text.split()
new_tokens = new_text.split()
if len(new_tokens) != len(old_tokens) + 1:
return # unexpected change, skip
# Find the inserted token by walking both lists in parallel.
# One token in new_tokens won't match — that's the inserted IPA.
insert_idx = -1
j = 0 # index into old_tokens
for i in range(len(new_tokens)):
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
j += 1
else:
insert_idx = i
break
if insert_idx < 0 or insert_idx >= len(new_tokens):
return
ipa_token = new_tokens[insert_idx]
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
ref_idx = insert_idx - 1
if ref_idx < 0 or ref_idx >= len(word_boxes):
return
ref_box = word_boxes[ref_idx]
ipa_box = {
'text': ipa_token,
'left': ref_box['left'] + ref_box['width'] + 2,
'top': ref_box['top'],
'width': ref_box['width'],
'height': ref_box['height'],
'conf': ref_box.get('conf', 90),
}
word_boxes.insert(insert_idx, ipa_box)
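# Example: old "challenge her", new "challenge [tʃˈælɪndʒ] her" (IPA value
# illustrative): walking both token lists finds the mismatch at index 1, so
# a synthetic box is placed 2 px to the right of the "challenge" box.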

File diff suppressed because it is too large


@@ -0,0 +1,476 @@
"""
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.
Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
provides functions to:
- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).
All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import (
IPA_AVAILABLE,
_britfone_dict,
_ipa_convert_american,
)
logger = logging.getLogger(__name__)
# --- D. Phonetic Bracket IPA Replacement ---
# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)
# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')
# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30
def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
"""Look up IPA for a word using the selected pronunciation dictionary.
Args:
word: English word to look up.
pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).
Returns:
IPA string or None if not found.
"""
word_lower = word.lower().strip()
if not word_lower:
return None
if pronunciation == 'british' and _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
# Fallback to American if not in Britfone
if _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
return None
if pronunciation == 'american' and _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
# Fallback to Britfone if not in CMU
if _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
return None
# Try any available source
if _britfone_dict:
ipa = _britfone_dict.get(word_lower)
if ipa:
return ipa
if _ipa_convert_american:
result = _ipa_convert_american(word_lower)
if result and '*' not in result:
return result
return None
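# Usage sketch (assuming both dictionaries are loaded; values as in the
# _fix_phonetic_brackets docstring below):
#   _lookup_ipa("dance", "british")  -> "dˈɑːns"  (Britfone)
#   _lookup_ipa("dance", "american") -> "dæns"    (eng_to_ipa/CMU)
#   _lookup_ipa("qqqq", "british")   -> None      (in neither source)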
def _fix_phonetic_brackets(
entries: List[Dict[str, Any]],
pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
"""Replace OCR'd phonetic transcriptions with dictionary IPA.
Detects patterns like "dance [du:ns]" and replaces with correct IPA:
- British: "dance [dˈɑːns]" (Britfone, MIT)
- American: "dance [dæns]" (eng_to_ipa/CMU, MIT)
Only replaces if the word before brackets is found in the dictionary.
"""
if not IPA_AVAILABLE:
return entries
# IPA phonetics only appear in the ENGLISH field of vocab tables.
# German and example fields contain meaningful parenthetical content:
# german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
# example: "(sich beschweren)", "(brauchen)", "(jammern)"
# These must NEVER be processed as phonetic transcriptions.
replaced_count = 0
for entry in entries:
text = entry.get('english', '') or ''
if not any(ch in text for ch in '[{('):
continue
new_text = _replace_phonetics_in_text(text, pronunciation)
if new_text != text:
logger.debug(f"_fix_phonetic_brackets: '{text}''{new_text}'")
replaced_count += 1
entry['english'] = new_text
if replaced_count:
logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
return entries
# Grammar particles that appear in brackets after English words:
# cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
# English prepositions/particles commonly in vocab tables
'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
# English grammar abbreviations used in vocab tables
'sth', 'sb', 'adj', 'adv',
# Number/plural/grammar annotations
'pl', 'sg', 'sing', 'no', 'also', 'auch',
# Regional English markers
'ae', 'be', 'ame', 'bre',
})
def _is_grammar_bracket_content(content: str) -> bool:
"""Return True if bracket content is grammar info in the ENGLISH field.
Grammar info: cross (with), complain (about/of), agree (on/with)
NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]
Since we only process the English field, we only need to recognize
English grammar particles. Everything else is (garbled) IPA.
"""
if not content:
return False
# Split on / and spaces for patterns like (about/of), (no pl)
tokens = re.split(r'[/\s]+', content.strip().lower())
tokens = [t for t in tokens if t]
if not tokens:
return False
# ALL tokens must be known grammar words
return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
def _replace_phonetics_in_text(
text: str,
pronunciation: str = 'british',
strip_orphans: bool = True,
) -> str:
"""Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.
Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
We match any bracket type and replace with dictionary IPA if found.
Legitimate parenthetical content like (zer)brechen or (veranstaltung) is preserved.
Args:
strip_orphans: If True, strip orphan brackets that look like garbled IPA.
Set to False for column_text where brackets may be German content.
"""
if not IPA_AVAILABLE:
return text
def replacer(match):
word = match.group(1)
bracket_content = match.group(2).strip()
full_match = match.group(0)
# Skip if bracket content looks like regular text (multiple words)
if len(bracket_content.split()) > 3:
return full_match
# Look up IPA for the word before brackets
ipa = _lookup_ipa(word, pronunciation)
if ipa:
# Word has IPA → bracket content is phonetic (garbled or correct).
# Exception: grammar particles like cross (with) — keep those.
if _is_grammar_bracket_content(bracket_content):
return full_match
logger.debug(f"phonetic: '{full_match}''{word} [{ipa}]'")
return f"{word} [{ipa}]"
# No IPA for this word — keep as-is
return full_match
text = _PHONETIC_BRACKET_RE.sub(replacer, text)
if strip_orphans:
# Second pass: strip remaining orphan brackets that are garbled IPA.
# These have no word before them (the main regex requires \b word \s* bracket).
# Examples: "[mais]", "{'mani setva]", trailing "(kros]"
# Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
def _strip_orphan_bracket(m):
content = m.group(1).strip()
# Keep grammar info: (sich beschweren), (about/of)
if _is_grammar_bracket_content(content):
return m.group(0)
# Keep correct IPA (contains Unicode IPA characters)
if any(ch in _IPA_CHARS for ch in content):
return m.group(0)
# Keep real-word parentheticals like (probieren), (Profit), (Geld).
# Garbled IPA fragments are short nonsense like (kros), (cy), (mais);
# they never contain a real word of 5+ letters, nor a capitalised
# 4-letter word (German nouns such as "Geld" are capitalised).
content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
if len(content_alpha) >= 5:
return m.group(0)
if len(content_alpha) == 4 and content_alpha[0].isupper():
return m.group(0)
logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
return ''
text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)
text = text.strip()
return text
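# Illustrative behaviour (assuming "china" and "cross" resolve via _lookup_ipa):
#   "China {'tfatno]"   -> "China [ˈtʃaɪnə]"  (garbled bracket replaced)
#   "cross (with)"      -> unchanged          (grammar particle kept)
#   "(cy)"              -> ""                 (orphan bracket, 2 alpha chars)
#   "(sich beschweren)" -> unchanged          (long real-word parenthetical)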
def _text_has_garbled_ipa(text: str) -> bool:
"""Check if text contains garbled IPA-like fragments from OCR.
Returns True if there is evidence of OCR-mangled phonetic
transcription, e.g. stress marks, length marks, or IPA special chars.
This is used to decide whether ``_insert_missing_ipa`` should run:
it must only insert IPA to *replace* garbled phonetics that are already
in the text — never to ADD phonetics where none existed on the page.
"""
# Bracketed text that doesn't contain valid IPA symbols is garbled OCR
# of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
stripped = text.strip()
if stripped.startswith('[') and stripped.endswith(']'):
inner = stripped[1:-1]
# Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
# Not a valid dictionary-style bracket like "(no pl)" — those
# use parentheses, not square brackets. Square brackets with
# no IPA chars are garbled phonetics.
return True
for w in text.strip().split():
# Skip delimiters and very short tokens
if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
continue
# Starts with stress mark (OCR read IPA stress ' as apostrophe)
if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
return True
if w.startswith("\u02c8") or w.startswith("\u02cc"): # ˈ ˌ
return True
# Contains IPA length mark ':' in a short non-word fragment
if ':' in w and len(w) < 12:
# But not things like "3:00" (time) or common words
stripped = re.sub(r'[^a-zA-Z:]', '', w)
if ':' in stripped and not stripped.replace(':', '').isalpha():
continue
return True
# Contains IPA special characters
if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
return True
# Embedded apostrophe suggesting merged garbled IPA with stress mark.
# E.g. "Scotland'skotland" — OCR reads ˈ as '.
# Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
# chars to avoid contractions (don't, won't, o'clock).
if "'" in w and not w.startswith("'"):
apos_idx = w.index("'")
after = w[apos_idx + 1:]
if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
return True
return False
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
"""Try to decompose a compound word and concatenate IPA for each part.
E.g. "schoolbag""school"+"bag" → IPA for both concatenated.
Only returns IPA if ALL parts are found in the dictionary.
Tries splits at every position (min 3 chars per part) and picks the
split where the first part is longest.
"""
if not IPA_AVAILABLE:
return None
lower = word.lower().strip()
if len(lower) < 6:
return None # too short for a compound
best_ipa = None
best_first_len = 0
for split_pos in range(3, len(lower) - 2): # min 3 chars each part
first = lower[:split_pos]
second = lower[split_pos:]
ipa_first = _lookup_ipa(first, pronunciation)
ipa_second = _lookup_ipa(second, pronunciation)
if ipa_first and ipa_second:
if split_pos > best_first_len:
best_first_len = split_pos
best_ipa = ipa_first + ipa_second
return best_ipa
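# Example: "schoolbag" is tried at every split position; assuming "school"
# and "bag" are both in the dictionary, the split at position 6 wins (the
# longest first part) and the two IPA strings are concatenated.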
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
"""Insert IPA pronunciation for English words that have no brackets at all.
OCR sometimes garbles the phonetic transcription into plain-text fragments
(e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
for the headword, inserts correct [IPA], and strips the garbled fragments.
Only inserts for words that:
- are standalone (not already followed by a bracket)
- have an IPA entry in the dictionary
- appear to be English headwords (at the start of text or after common
separators like ",", ";", "–")
This is intentionally conservative: it only inserts at the END of each
whitespace-separated token group to avoid breaking phrases.
"""
if not IPA_AVAILABLE:
return text
if not text or not text.strip():
return text
# Skip if already has brackets (IPA replacement handles those)
if any(ch in text for ch in '[{('):
return text
# Only process short text fragments (typical vocab cells).
# Long sentences / paragraphs should not get IPA insertions.
words = text.strip().split()
if len(words) > 6:
return text
# Try to insert IPA for the first alphanumeric word
# Typical patterns: "challenge", "profit", "film", "badge"
for i, w in enumerate(words):
# Clean punctuation for lookup
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
if not clean or len(clean) < 2:
continue
# Skip German/grammar words
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
continue
ipa = _lookup_ipa(clean, pronunciation)
# Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
if not ipa and '-' in clean:
ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
# Fallback 0b: compound word decomposition
# E.g. "schoolbag" → "school"+"bag" → concatenated IPA
if not ipa:
ipa = _decompose_compound(clean, pronunciation)
# Fallback 1: IPA-marker split for merged tokens where OCR
# joined headword with its IPA (e.g. "schoolbagsku:lbæg").
# Find the first IPA marker character (:, æ, ɪ, etc.), walk
# backwards ≤3 chars for the onset consonant cluster, and
# split into headword + OCR IPA.
_IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
if not ipa:
first_marker = next(
(p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
)
if first_marker >= 3:
split = first_marker
while (split > 0
and split > first_marker - 3
and w[split - 1].isalpha()
and w[split - 1].islower()):
split -= 1
if split >= 2:
headword = w[:split]
ocr_ipa = w[split:]
hw_ipa = _lookup_ipa(headword, pronunciation)
if not hw_ipa:
# Try compound decomposition for the headword part
hw_ipa = _decompose_compound(headword, pronunciation)
if hw_ipa:
words[i] = f"{headword} [{hw_ipa}]"
else:
# Word not in dictionary — use OCR IPA
words[i] = f"{headword} [{ocr_ipa}]"
words = words[:i + 1]
ipa = True # signal that we handled it
break
# Fallback 2: prefix matching for merged tokens WITHOUT IPA
# markers (e.g. "Scotland'skotland"). Find longest dictionary
# prefix using only alpha chars to avoid punctuation matches.
if not ipa:
alpha = re.sub(r'[^a-zA-Z]', '', clean)
if len(alpha) > 5: # need at least 6 chars for meaningful split
for end in range(len(alpha), 3, -1): # min prefix 4 chars
prefix = alpha[:end]
test_ipa = _lookup_ipa(prefix, pronunciation)
if test_ipa:
ipa = test_ipa
w = prefix
words[i] = prefix
break
if ipa:
words[i] = f"{w} [{ipa}]"
# Strip garbled OCR phonetics after the IPA bracket.
# On scanned vocab pages, printed IPA is read as garbled
# text (e.g. "scare skea" where "skea" is garbled /skɛə/).
# After inserting correct IPA, remove remaining words that
# aren't real English words, delimiters, or German text.
kept = words[:i + 1]
for j in range(i + 1, len(words)):
wj = words[j]
# Delimiter — keep this and everything after
if wj in ('–', '—', '-', '/', '|', ',', ';'):
kept.extend(words[j:])
break
# Pure digits or numbering (e.g. "1", "2.", "3)") — keep
if re.match(r'^[\d.)\-]+$', wj):
kept.extend(words[j:])
break
# Starts with uppercase — likely German or proper noun
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
if clean_j and clean_j[0].isupper():
kept.extend(words[j:])
break
# Known English word (≥2 chars) — keep it and rest
if clean_j and len(clean_j) >= 2:
if _lookup_ipa(clean_j, pronunciation):
kept.extend(words[j:])
break
# Merged token: dictionary word + garbled IPA stuck together.
# E.g. "fictionsalans'fIkfn" starts with "fiction".
# Extract the dictionary prefix (≥4 chars) and add it with
# IPA, but only if enough chars remain after the prefix (≥3)
# to look like garbled IPA, not just a plural 's'.
if clean_j and len(clean_j) >= 7:
for pend in range(min(len(clean_j) - 3, 15), 3, -1):
prefix_j = clean_j[:pend]
prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
if prefix_ipa:
kept.append(f"{prefix_j} [{prefix_ipa}]")
break
break # rest of this token is garbled
# Otherwise — likely garbled phonetics, skip
words = kept
break
return ' '.join(words)
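# Example (from the docstring above): "scare skea" becomes "scare [...]"
# with dictionary IPA inserted after the headword; the trailing "skea"
# (garbled /skɛə/) is dropped because it is neither a delimiter, a number,
# a capitalised word, nor a dictionary word.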


@@ -0,0 +1,287 @@
"""
Advanced IPA repair for OCR-extracted vocabulary.
Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
to stay within the 500 LOC budget.
Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""
import logging
import re
from typing import Any, Dict, List, Optional
from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
_lookup_ipa,
_GRAMMAR_BRACKET_WORDS,
)
logger = logging.getLogger(__name__)
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
"""Check if text has a headword followed by non-dictionary trailing words.
Used as an additional trigger for ``_insert_missing_ipa`` when
``_text_has_garbled_ipa`` returns False because the garbled IPA
happens to look like plain ASCII (e.g. "skea" for /skɛə/).
"""
if not IPA_AVAILABLE:
return False
words = text.strip().split()
if len(words) < 2 or len(words) > 6:
return False
# Find first dictionary word
hw_idx = -1
for i, w in enumerate(words):
clean = re.sub(r'[^a-zA-Z\'-]', '', w)
if not clean or len(clean) < 2:
continue
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
continue
if _lookup_ipa(clean, pronunciation):
hw_idx = i
break
if hw_idx < 0 or hw_idx >= len(words) - 1:
return False
# Check ALL remaining words — if none are dictionary/delimiter/German,
# they are likely garbled IPA.
for j in range(hw_idx + 1, len(words)):
wj = words[j]
if wj in ('–', '—', '-', '/', '|', ',', ';'):
return False
# Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
if re.match(r'^[\d.)\-]+$', wj):
return False
clean_j = re.sub(r'[^a-zA-Z]', '', wj)
if clean_j and clean_j[0].isupper():
return False
if clean_j and len(clean_j) >= 2 and _lookup_ipa(clean_j, pronunciation):
return False
return True
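# Example (assuming dictionary lookups): "scare skea" -> True ("skea" is
# not a dictionary word), while "close down" -> False ("down" resolves,
# so the trailing word is not garbled IPA).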
def _strip_post_bracket_garbled(
text: str, pronunciation: str = 'british',
) -> str:
"""Strip garbled IPA fragments that trail after proper [IPA] brackets.
E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``
For multi-word headwords like "seat belt", a real English word ("belt")
may be followed by garbled IPA duplicates. We detect this by checking
whether the sequence after a real word contains IPA markers (`:`, `ə`,
etc.) — if so, everything from the first garbled token onward is stripped.
"""
if ']' not in text:
return text
last_bracket = text.rfind(']')
if last_bracket >= len(text) - 1:
return text
before = text[:last_bracket + 1].rstrip()
after = text[last_bracket + 1:].strip()
if not after:
return text
_IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
after_words = after.split()
kept: List[str] = []
for idx, w in enumerate(after_words):
# Delimiter — keep rest
if w in ('–', '—', '-', '/', '|', ',', ';'):
kept.extend(after_words[idx:])
break
# Contains IPA markers (length mark, IPA chars) — garbled, skip
if any(c in w for c in _IPA_MARKER_CHARS):
# Everything from here is garbled IPA — stop scanning
# but look ahead: if any remaining words are real English
# words WITHOUT IPA markers, they might be a different headword
# following. Only skip the contiguous garbled run.
continue
clean = re.sub(r'[^a-zA-Z]', '', w)
# Uppercase — likely German, keep rest
if clean and clean[0].isupper():
kept.extend(after_words[idx:])
break
# Known English word — keep it, but check if followed by garbled IPA
# (multi-word headword case like "seat [siːt] belt si:t belt")
if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
# Peek ahead: if next word has IPA markers, the rest is garbled
remaining = after_words[idx + 1:]
has_garbled_after = any(
any(c in rw for c in _IPA_MARKER_CHARS)
for rw in remaining
)
if has_garbled_after:
# Keep this real word but stop — rest is garbled duplication
kept.append(w)
# Still scan for delimiters/German in the remaining words
for ridx, rw in enumerate(remaining):
if rw in ('–', '—', '-', '/', '|', ',', ';'):
kept.extend(remaining[ridx:])
break
rclean = re.sub(r'[^a-zA-Z]', '', rw)
if rclean and rclean[0].isupper():
kept.extend(remaining[ridx:])
break
break
else:
kept.extend(after_words[idx:])
break
# Unknown short word — likely garbled, skip
if kept:
return before + ' ' + ' '.join(kept)
return before
def fix_ipa_continuation_cell(
garbled_text: str,
headword_text: str,
pronunciation: str = 'british',
) -> str:
"""Replace garbled IPA in a continuation row with proper IPA.
Continuation rows appear below the headword and contain only the
printed phonetic transcription, which OCR garbles into fragments
like ``ska:f ska:vz`` (should be ``[skˈɑːf] [skˈɑːvz]``).
Args:
garbled_text: The OCR-garbled IPA text from the continuation row.
headword_text: The headword text from the previous row
(e.g. ``scarf scarves``).
pronunciation: ``'british'`` or ``'american'``.
Returns:
Corrected IPA text, or the original if no fix could be applied.
"""
if not IPA_AVAILABLE or not garbled_text or not headword_text:
return garbled_text
# If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
# only generate continuation IPA for words NOT already covered.
covered_words: set = set()
has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
if has_inline_ipa:
# Words before the first bracket already have their IPA shown
first_bracket = headword_text.index('[')
pre_bracket = headword_text[:first_bracket].strip()
for w in pre_bracket.split():
clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
if clean and len(clean) >= 2:
covered_words.add(clean)
last_bracket_end = headword_text.rfind(']')
tail = headword_text[last_bracket_end + 1:].strip()
if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
# Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
# — return the inline IPA directly (continuation duplicates it)
last_bracket_start = headword_text.rfind('[')
inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
return inline_ipa
# Only the tail words need continuation IPA
headword_text = tail
# Strip existing IPA brackets and parenthetical grammar annotations
# like "(no pl)", "(sth)", "(sb)" from headword text
clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
if not clean_hw:
return garbled_text
# Split headword by delimiters (– — -)
# "scarf scarves" → ["scarf", "scarves"]
# "see - saw - seen" → ["see", "saw", "seen"]
parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
parts = [p.strip() for p in parts if p.strip()]
if not parts:
return garbled_text
# Look up IPA for each headword part.
# Skip articles (the, a, an) — they never get IPA in vocab books.
# Other function words like "down", "up" are kept because they are
# integral parts of phrasal verbs (e.g. "close down").
# Skip words that already have inline IPA in the headword row.
_ARTICLES = {'the', 'a', 'an'}
ipa_parts: List[str] = []
for part in parts:
# A part may be multi-word like "secondary school"
words = part.split()
word_ipas: List[str] = []
for w in words:
clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
if not clean_w or len(clean_w) < 2:
continue
if covered_words and clean_w.lower() in covered_words:
continue # Already has IPA inline in the headword
if clean_w.lower() in _ARTICLES:
continue # Articles never get IPA in vocab books
ipa = _lookup_ipa(clean_w, pronunciation)
if ipa:
word_ipas.append(ipa)
if word_ipas:
ipa_parts.append('[' + ' '.join(word_ipas) + ']')
if not ipa_parts:
return garbled_text
# Join with delimiter
result = ' '.join(ipa_parts)
logger.debug(
"fix_ipa_continuation: '%s''%s' (headwords: '%s')",
garbled_text, result, headword_text,
)
return result
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
"""Insert IPA for the first English headword in a long mixed-language line.
Unlike _insert_missing_ipa (for short column_en cells), this handles
column_text lines of any length. It only inserts IPA for the FIRST word
if that word:
- has no bracket following it already
- has an IPA entry in the dictionary
- is not a number/symbol prefix like "».55"
Returns the text with [ipa] inserted after the first word, or unchanged.
"""
if not IPA_AVAILABLE:
return text
if not text or not text.strip():
return text
words = text.strip().split()
if not words:
return text
# Check if text already starts with a bracket (IPA already present)
if len(words) > 1 and words[1].startswith(('[', '{', '(')):
return text
# Try the first few words (skip numeric prefixes like "».55", "0.56")
for i in range(min(3, len(words))):
w = words[i]
clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
if not clean or len(clean) < 2:
continue
if clean.lower() in _GRAMMAR_BRACKET_WORDS:
continue
ipa = _lookup_ipa(clean, pronunciation)
if ipa:
words[i] = f"{w} [{ipa}]"
return ' '.join(words)
# Stop at first real word even if no IPA found
break
return text


@@ -0,0 +1,318 @@
"""
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.
- Character confusion fix (I/1/l/|)
- Comma-separated word form splitting
- Example sentence attachment to matching vocab entries
Split from cv_ocr_engines.py for maintainability.
"""
import re
from typing import Any, Dict, List
# =============================================================================
# Post-Processing: Deterministic Quality Fixes
# =============================================================================
# --- A. Character Confusion Fix (I/1/l) ---
# Common OCR confusion pairs in vocabulary context
_CHAR_CONFUSION_RULES = [
# "1" at word start followed by lowercase → likely "I" or "l"
# Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
(re.compile(r'\b1([a-z])'), r'I\1'), # 1ch → Ich, 1want → Iwant
# Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
(re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'), # "1 want" → "I want"
# "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
# and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
(re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'), # |ch → Ich, | want → I want
]
# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Fix common OCR character confusions using context.
Deterministic rules:
- "1" at word start → "I" or "l" based on context
- Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
- "y " artifact at word boundaries → remove (e.g. "y you" → "you")
"""
for entry in entries:
en = entry.get('english', '') or ''
de = entry.get('german', '') or ''
ex = entry.get('example', '') or ''
# Apply general rules to all fields
for pattern, replacement in _CHAR_CONFUSION_RULES:
en = pattern.sub(replacement, en)
de = pattern.sub(replacement, de)
ex = pattern.sub(replacement, ex)
# Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
de_lower_words = set(de.lower().replace(',', ' ').split())
if de_lower_words & _DE_INDICATORS_FOR_EN_I:
# Any remaining "1" in EN that looks like "I"
en = re.sub(r'\b1\b(?![\d.,])', 'I', en)
# Fix "y " artifact before repeated word: "y you" → "you"
en = re.sub(r'\by\s+([a-z])', r'\1', en)
ex = re.sub(r'\by\s+([a-z])', r'\1', ex)
entry['english'] = en.strip()
entry['german'] = de.strip()
entry['example'] = ex.strip()
return entries
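# Illustrative fixes:
#   "1 want to go" -> "I want to go"  (standalone "1", not a list number)
#   "|ch bin"      -> "Ich bin"       ("|" not embedded between letters)
#   "1. Kreuz"     -> unchanged       (numbered-list prefix is protected)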
# --- B. Comma-Separated Word Form Splitting ---
def _is_singular_plural_pair(parts: List[str]) -> bool:
"""Detect if comma-separated parts are singular/plural forms of the same word.
E.g. "mouse, mice" or "Maus, Mäuse" → True (should NOT be split).
"break, broke, broken" → False (different verb forms, OK to split).
Heuristic: exactly 2 parts that share a common prefix of >= 50% length,
OR one part is a known plural suffix of the other (e.g. +s, +es, +en).
"""
if len(parts) != 2:
return False
a, b = parts[0].lower().strip(), parts[1].lower().strip()
if not a or not b:
return False
# Common prefix heuristic: if words share >= 50% of the shorter word,
# they are likely forms of the same word (Maus/Mäuse, child/children).
min_len = min(len(a), len(b))
common = 0
for ca, cb in zip(a, b):
if ca == cb:
common += 1
else:
break
if common >= max(2, min_len * 0.5):
return True
# Umlaut relation: the plural adds an umlaut and usually a suffix
# (Maus → Mäuse). Fold umlauts back and compare prefixes, so that
# "mäuse" folds to "mause", which starts with "maus".
fold_map = str.maketrans('äöü', 'aou')
a_fold, b_fold = a.translate(fold_map), b.translate(fold_map)
if a_fold.startswith(b_fold) or b_fold.startswith(a_fold):
return True
return False
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Split entries with comma-separated word forms into individual entries.
E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
→ 3 entries: break/brechen, broke/brach, broken/gebrochen
Does NOT split singular/plural pairs like "mouse, mice" / "Maus, Mäuse"
because those are forms of the same vocabulary entry.
Only splits when both EN and DE have the same number of comma-parts,
all parts are short (word forms, not sentences), and the parts are not
a singular/plural pair (two forms that belong to one entry).
"""
result: List[Dict[str, Any]] = []
for entry in entries:
en = (entry.get('english', '') or '').strip()
de = (entry.get('german', '') or '').strip()
# Split by comma (but not inside brackets or parentheses)
en_parts = _split_by_comma(en)
de_parts = _split_by_comma(de)
# Only split if we have multiple parts and counts match
should_split = False
if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
# All parts must be short (word forms, not sentences)
if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
# Do NOT split singular/plural pairs (2 parts that are
# forms of the same word)
if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
should_split = False
else:
should_split = True
if not should_split:
result.append(entry)
continue
# Split into individual entries
for k in range(len(en_parts)):
sub = dict(entry) # shallow copy
sub['english'] = en_parts[k].strip()
sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
sub['example'] = '' # examples get attached later
sub['split_from_comma'] = True
result.append(sub)
# Re-number
for i, e in enumerate(result):
e['row_index'] = i
return result
def _split_by_comma(text: str) -> List[str]:
"""Split text by commas, but not inside brackets [...] or parens (...)."""
if ',' not in text:
return [text]
parts = []
depth_bracket = 0
depth_paren = 0
current = []
for ch in text:
if ch == '[':
depth_bracket += 1
elif ch == ']':
depth_bracket = max(0, depth_bracket - 1)
elif ch == '(':
depth_paren += 1
elif ch == ')':
depth_paren = max(0, depth_paren - 1)
elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
parts.append(''.join(current).strip())
current = []
continue
current.append(ch)
if current:
parts.append(''.join(current).strip())
# Filter empty parts
return [p for p in parts if p]
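# Example: _split_by_comma("break, broke, broken") returns
# ["break", "broke", "broken"], while _split_by_comma("stand up (for sth, sb)")
# returns the whole string as one part because the comma sits inside parens.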
# --- C. Example Sentence Attachment ---
def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
"""Find the vocab entry whose English word(s) best match the example sentence.
Returns index into vocab_entries, or -1 if no match found.
Uses word stem overlap: "a broken arm" matches "broken" directly,
or "broke" via the shared 4-character stem "brok".
"""
if not vocab_entries or not example_text:
return -1
example_lower = example_text.lower()
example_words = set(re.findall(r'[a-zäöüß]+', example_lower))
best_idx = -1
best_score = 0
for i, entry in enumerate(vocab_entries):
en = (entry.get('english', '') or '').lower()
if not en:
continue
# Extract vocab words (split on space, comma, newline)
vocab_words = set(re.findall(r'[a-zäöüß]+', en))
# Score: how many vocab words appear in the example?
# Also check if example words share a common stem (first 4 chars)
direct_matches = vocab_words & example_words
score = len(direct_matches) * 10
# Stem matching: "broken" matches vocab word "broke" via shared 4-char stem "brok"
if score == 0:
for vw in vocab_words:
if len(vw) < 3:
continue
stem = vw[:4] if len(vw) >= 4 else vw[:3]
for ew in example_words:
if len(ew) >= len(stem) and ew[:len(stem)] == stem:
score += 5
break
if score > best_score:
best_score = score
best_idx = i
return best_idx if best_score > 0 else -1
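# Scoring sketch (hand-traced): a direct word overlap scores 10 per word,
# a 4-char stem prefix match scores 5, and ties keep the earlier entry.
#   vocab "broken", example "The plate is broken." -> direct match, score 10
#   vocab "broke",  example "a broken arm"         -> stem "brok" match, score 5
#   vocab "egg",    example "a broken arm"         -> no match, score 0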
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Attach rows with EN text but no DE translation as examples to matching vocab entries.
Vocabulary worksheets often have:
Row 1: break, broke, broken / brechen, brach, gebrochen
Row 2: He broke his arm. (no DE → example for "broke")
Row 3: The plate is broken. (no DE → example for "broken")
Row 4: egg / Ei (has DE → new vocab entry)
Rules (deterministic, generic):
- A row is an "example row" if it has EN text but NO DE text (a single-char DE counts as OCR noise, not a translation)
- Find the best matching vocab entry by checking which entry's English words
appear in the example sentence (semantic matching via word overlap)
- Fall back to the nearest preceding entry if no word match found
- Multiple examples get joined with " | "
"""
if not entries:
return entries
# Separate into vocab entries (have DE) and example candidates (no DE)
vocab_entries: List[Dict[str, Any]] = []
examples_for: Dict[int, List[str]] = {} # vocab_index → list of example texts
for entry in entries:
en = (entry.get('english', '') or '').strip()
de = (entry.get('german', '') or '').strip()
ex = (entry.get('example', '') or '').strip()
# Treat single-char DE as OCR noise, not real translation.
# "Ei" (2 chars) is a valid German word, so threshold is 1.
has_de = len(de) > 1
has_en = bool(en)
# Heuristic: a row without DE is an "example sentence" only if
# the EN text looks like a sentence (>= 4 words, or contains
# typical sentence punctuation). Short EN text (1-3 words) is
# more likely a vocab entry whose DE was missed by OCR.
_looks_like_sentence = (
len(en.split()) >= 4
or en.rstrip().endswith(('.', '!', '?'))
)
is_example_candidate = (
has_en and not has_de and _looks_like_sentence and vocab_entries
)
if is_example_candidate:
# This is an example sentence — find best matching vocab entry
example_text = en
match_idx = _find_best_vocab_match(en, vocab_entries)
if match_idx < 0:
# No word match → fall back to last entry
match_idx = len(vocab_entries) - 1
if match_idx not in examples_for:
examples_for[match_idx] = []
examples_for[match_idx].append(example_text)
else:
vocab_entries.append(entry)
# Attach examples to their matched vocab entries
for idx, example_list in examples_for.items():
if 0 <= idx < len(vocab_entries):
entry = vocab_entries[idx]
existing_ex = (entry.get('example', '') or '').strip()
new_examples = ' | '.join(example_list)
entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples
# Re-number
for i, e in enumerate(vocab_entries):
e['row_index'] = i
return vocab_entries
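# Worked example (illustrative rows, hand-traced):
#   in:  [{english: "broke", german: "brach"},
#         {english: "He broke his arm.", german: ""},   # sentence, no DE
#         {english: "egg", german: "Ei"}]
#   out: [{english: "broke", german: "brach", example: "He broke his arm."},
#         {english: "egg", german: "Ei"}]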


@@ -0,0 +1,134 @@
"""
Word assembly helpers for OCR output.
Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text. All functions are pure standard-library; no NumPy or project
imports required.
"""
import logging
from typing import Dict, List
logger = logging.getLogger(__name__)
def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
"""Group words by Y position into lines, sorted by X within each line."""
if not words:
return []
sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
lines: List[List[Dict]] = []
current_line: List[Dict] = [sorted_words[0]]
current_y = sorted_words[0]['top']
for word in sorted_words[1:]:
if abs(word['top'] - current_y) <= y_tolerance_px:
current_line.append(word)
else:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
current_line = [word]
current_y = word['top']
if current_line:
current_line.sort(key=lambda w: w['left'])
lines.append(current_line)
return lines
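# Example (hand-traced): tops within y_tolerance_px of the line's first word
# share a line.
#   words at top=100, 104, 139 with tolerance 20
#   -> line 1: tops 100 and 104 (|104-100| <= 20)
#   -> line 2: top 139 (|139-100| > 20); each line is sorted by 'left'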
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
"""Group OCR words into visual lines in reading order.
Returns a list of line strings (one per visual line in the cell).
"""
if not words:
return []
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
return [' '.join(w['text'] for w in line) for line in lines]
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
"""Rejoin words split by line-break hyphenation.
E.g. ['Fuß-', 'boden'] → ['Fußboden']
['vocabulary train-', 'ing starts'] → ['vocabulary training starts']
"""
if len(lines) <= 1:
return lines
result = []
i = 0
while i < len(lines):
line = lines[i]
# If line ends with '-' and there's a next line, rejoin
if i + 1 < len(lines) and line.rstrip().endswith('-'):
stripped = line.rstrip()
# Get the word fragment before hyphen (last word)
prefix = stripped[:-1] # remove trailing hyphen
next_line = lines[i + 1]
# Join: last word of this line + first word of next line
prefix_words = prefix.rsplit(' ', 1)
next_words = next_line.split(' ', 1)
if len(prefix_words) > 1:
joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
else:
joined = prefix_words[0] + next_words[0]
remainder = next_words[1] if len(next_words) > 1 else ''
if remainder:
result.append(joined + ' ' + remainder)
else:
result.append(joined)
i += 2
else:
result.append(line)
i += 1
return result
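# Example (hand-traced): only a trailing '-' triggers a rejoin, and only the
# last word of the hyphenated line is glued to the first word of the next.
#   ['vocabulary train-', 'ing starts'] -> ['vocabulary training starts']
#   ['well-known', 'words']             -> unchanged (no trailing hyphen)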
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
"""Join OCR words into text in correct reading order, preserving line breaks.
Groups words into visual lines by Y-tolerance, sorts each line by X,
rejoins hyphenated words, then joins lines with newlines.
"""
lines = _words_to_reading_order_lines(words, y_tolerance_px)
lines = _rejoin_hyphenated(lines)
return '\n'.join(lines)
def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
"""Join OCR words preserving proportional horizontal spacing.
Instead of single spaces between words, inserts multiple spaces based on
the pixel gap between words relative to average character width.
Useful for box sub-sessions where spatial layout matters.
"""
lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
result_lines = []
for line_words in lines:
if not line_words:
continue
sorted_words = sorted(line_words, key=lambda w: w['left'])
# Calculate average character width from all words in line
total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
total_width = sum(w['width'] for w in sorted_words if w.get('text'))
avg_char_width = total_width / total_chars if total_chars > 0 else 10
parts = []
for i, word in enumerate(sorted_words):
parts.append(word.get('text', ''))
if i < len(sorted_words) - 1:
next_word = sorted_words[i + 1]
gap_px = next_word['left'] - (word['left'] + word['width'])
num_spaces = max(1, round(gap_px / avg_char_width))
parts.append(' ' * num_spaces)
result_lines.append(''.join(parts))
return '\n'.join(result_lines)
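# Example (hand-traced, same-line words): with "Name" (left=0, width=40) and
# "Alter" (left=100, width=50), the average char width is (40+50)/(4+5) = 10 px,
# the gap is 100 - 40 = 60 px, so 6 spaces are inserted: "Name      Alter".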


@@ -0,0 +1,140 @@
"""
DSFA Chunking — Text chunking strategies for document ingestion.
Contains:
- chunk_text_recursive: Recursive chunking with overlap
- chunk_by_sections: Section-marker-based chunking
- chunk_by_list_items: List-item-based chunking
- chunk_document: Strategy router
"""
import re
from typing import List, Dict
from dsfa_sources_registry import DSFA_CHUNK_CONFIG
def chunk_text_recursive(text: str, max_size: int = 1000, overlap: int = 200) -> List[Dict]:
"""Recursively chunk text with overlap."""
chunks = []
start = 0
while start < len(text):
end = min(start + max_size, len(text))
# Find a good break point (sentence end, paragraph)
if end < len(text):
for sep in ["\n\n", "\n", ". ", ", ", " "]:
last_sep = text[start:end].rfind(sep)
if last_sep > max_size // 2:
end = start + last_sep + len(sep)
break
chunk_text = text[start:end].strip()
if chunk_text:
chunks.append({
"content": chunk_text,
"start_char": start,
"end_char": end
})
start = end - overlap if end < len(text) else len(text)
return chunks
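# Sketch of the overlap behaviour (illustrative numbers): with max_size=1000
# and overlap=200, an 1800-char text with no separators yields chunks covering
# [0..1000) and [800..1800); each chunk restarts 200 chars before the previous
# end, so content cut at a boundary reappears intact in the next chunk.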
def chunk_by_sections(text: str, markers: List[str], max_size: int = 1500, overlap: int = 200) -> List[Dict]:
"""Chunk text by section markers."""
chunks = []
pattern = "|".join(f"({m})" for m in markers)
matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
if not matches:
return chunk_text_recursive(text, max_size, overlap)
for i, match in enumerate(matches):
start = match.start()
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
section_text = text[start:end].strip()
section_title = match.group(0).strip()
if len(section_text) > max_size:
sub_chunks = chunk_text_recursive(section_text, max_size, overlap)
for j, sub in enumerate(sub_chunks):
chunks.append({
"content": sub["content"],
"section_title": section_title if j == 0 else f"{section_title} (cont.)",
"start_char": start + sub["start_char"],
"end_char": start + sub["end_char"]
})
else:
chunks.append({
"content": section_text,
"section_title": section_title,
"start_char": start,
"end_char": end
})
return chunks
def chunk_by_list_items(text: str, markers: List[str], max_size: int = 800) -> List[Dict]:
"""Chunk text by list item markers."""
chunks = []
pattern = "|".join(f"({m})" for m in markers)
lines = text.split("\n")
current_item = ""
current_start = 0
for i, line in enumerate(lines):
if re.match(pattern, line.strip()):
if current_item.strip():
chunks.append({
"content": current_item.strip(),
"start_char": current_start,
"end_char": current_start + len(current_item)
})
current_item = line
current_start = sum(len(lines[j]) + 1 for j in range(i))
else:
current_item += "\n" + line
if current_item.strip():
chunks.append({
"content": current_item.strip(),
"start_char": current_start,
"end_char": current_start + len(current_item)
})
return chunks
def chunk_document(text: str, source_code: str) -> List[Dict]:
"""Chunk document using appropriate strategy for source type."""
config = DSFA_CHUNK_CONFIG.get(source_code, DSFA_CHUNK_CONFIG["DEFAULT"])
if source_code.endswith("_MUSS_PUBLIC") or source_code.endswith("_MUSS_PRIVATE"):
config = DSFA_CHUNK_CONFIG["MUSS_LISTEN"]
if config["strategy"] == "section_based":
return chunk_by_sections(
text,
config["section_markers"],
config["max_chunk_size"],
config["overlap"]
)
elif config["strategy"] == "list_item":
return chunk_by_list_items(
text,
config["list_markers"],
config["max_chunk_size"]
)
else:
return chunk_text_recursive(
text,
config["max_chunk_size"],
config["overlap"]
)
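# Hypothetical config entry (the real DSFA_CHUNK_CONFIG lives in
# dsfa_sources_registry; this shape is only inferred from the lookups above):
#   DSFA_CHUNK_CONFIG["DEFAULT"] = {
#       "strategy": "recursive",
#       "max_chunk_size": 1000,
#       "overlap": 200,
#   }
#   chunk_document(text, "UNKNOWN_CODE") would then fall back to DEFAULT and
#   route to chunk_text_recursive(text, 1000, 200).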

File diff suppressed because it is too large


@@ -0,0 +1,239 @@
"""
DSFA Corpus Store — Database operations and data classes.
Contains:
- DSFAChunkPayload: Qdrant point payload schema
- DSFASearchResult: Search result with attribution
- DSFACorpusStore: PostgreSQL operations for DSFA corpus
"""
import hashlib
import uuid
from typing import List, Dict, Optional
from dataclasses import dataclass
import asyncpg
from dsfa_sources_registry import LICENSE_REGISTRY
@dataclass
class DSFAChunkPayload:
"""Payload schema for Qdrant points."""
chunk_id: str
document_id: str
source_id: str
content: str
section_title: Optional[str] = None
source_code: str = ""
source_name: str = ""
attribution_text: str = ""
license_code: str = ""
attribution_required: bool = True
document_type: str = ""
category: str = ""
language: str = "de"
page_number: Optional[int] = None
@dataclass
class DSFASearchResult:
"""Search result with attribution."""
chunk_id: str
content: str
score: float
source_code: str
source_name: str
attribution_text: str
license_code: str
license_url: Optional[str]
attribution_required: bool
source_url: Optional[str]
document_type: str
category: str
section_title: Optional[str]
page_number: Optional[int]
class DSFACorpusStore:
"""Database operations for DSFA corpus."""
def __init__(self, pool: asyncpg.Pool):
self.pool = pool
async def register_source(self, source_data: Dict) -> str:
"""Register a DSFA source in the database."""
async with self.pool.acquire() as conn:
existing = await conn.fetchval(
"SELECT id FROM dsfa_sources WHERE source_code = $1",
source_data["source_code"]
)
if existing:
await conn.execute("""
UPDATE dsfa_sources SET
name = $2,
full_name = $3,
organization = $4,
source_url = $5,
eur_lex_celex = $6,
license_code = $7,
license_url = $8,
attribution_required = $9,
attribution_text = $10,
document_type = $11,
language = $12,
updated_at = NOW()
WHERE source_code = $1
""",
source_data["source_code"],
source_data["name"],
source_data.get("full_name"),
source_data.get("organization"),
source_data.get("source_url"),
source_data.get("eur_lex_celex"),
source_data["license_code"],
source_data.get("license_url"),
LICENSE_REGISTRY.get(source_data["license_code"], {}).get("attribution_required", True),
source_data["attribution_text"],
source_data.get("document_type"),
source_data.get("language", "de")
)
return str(existing)
else:
source_id = await conn.fetchval("""
INSERT INTO dsfa_sources (
source_code, name, full_name, organization, source_url,
eur_lex_celex, license_code, license_url, attribution_required,
attribution_text, document_type, language
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
RETURNING id
""",
source_data["source_code"],
source_data["name"],
source_data.get("full_name"),
source_data.get("organization"),
source_data.get("source_url"),
source_data.get("eur_lex_celex"),
source_data["license_code"],
source_data.get("license_url"),
LICENSE_REGISTRY.get(source_data["license_code"], {}).get("attribution_required", True),
source_data["attribution_text"],
source_data.get("document_type"),
source_data.get("language", "de")
)
return str(source_id)
async def get_source_by_code(self, source_code: str) -> Optional[Dict]:
"""Get source by its code."""
async with self.pool.acquire() as conn:
row = await conn.fetchrow(
"SELECT * FROM dsfa_sources WHERE source_code = $1",
source_code
)
if row:
return dict(row)
return None
async def list_sources(self) -> List[Dict]:
"""List all registered sources."""
async with self.pool.acquire() as conn:
rows = await conn.fetch(
"SELECT * FROM dsfa_sources ORDER BY source_code"
)
return [dict(row) for row in rows]
async def create_document(
self,
source_id: str,
title: str,
file_name: Optional[str] = None,
file_type: Optional[str] = None,
minio_path: Optional[str] = None,
original_url: Optional[str] = None,
metadata: Optional[Dict] = None
) -> str:
"""Create a document record."""
import json
metadata_json = json.dumps(metadata or {})
async with self.pool.acquire() as conn:
doc_id = await conn.fetchval("""
INSERT INTO dsfa_documents (
source_id, title, file_name, file_type, minio_path,
original_url, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb)
RETURNING id
""",
uuid.UUID(source_id),
title,
file_name,
file_type,
minio_path,
original_url,
metadata_json
)
return str(doc_id)
async def create_chunk(
self,
document_id: str,
source_id: str,
content: str,
chunk_index: int,
section_title: Optional[str] = None,
page_number: Optional[int] = None,
category: Optional[str] = None,
qdrant_point_id: Optional[str] = None,
metadata: Optional[Dict] = None
) -> str:
"""Create a chunk record."""
import json
content_hash = hashlib.sha256(content.encode()).hexdigest()
async with self.pool.acquire() as conn:
chunk_id = await conn.fetchval("""
INSERT INTO dsfa_document_chunks (
document_id, source_id, content, content_hash, chunk_index,
section_title, page_number, category, qdrant_point_id, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb)
RETURNING id
""",
uuid.UUID(document_id),
uuid.UUID(source_id),
content,
content_hash,
chunk_index,
section_title,
page_number,
category,
qdrant_point_id,
json.dumps(metadata or {})
)
return str(chunk_id)
async def get_chunk_with_attribution(self, chunk_id: str) -> Optional[Dict]:
"""Get a chunk with full source attribution."""
async with self.pool.acquire() as conn:
row = await conn.fetchrow("""
SELECT * FROM dsfa_chunk_with_attribution
WHERE chunk_id = $1
""", uuid.UUID(chunk_id))
if row:
return dict(row)
return None
async def get_source_stats(self) -> List[Dict]:
"""Get aggregated stats per source."""
async with self.pool.acquire() as conn:
rows = await conn.fetch("SELECT * FROM dsfa_source_stats")
return [dict(row) for row in rows]
async def update_document_indexed(self, document_id: str, chunks_count: int):
"""Update document with indexing information."""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE dsfa_documents
SET chunks_generated = $2,
last_indexed_at = NOW(),
text_extracted = true
WHERE id = $1
""", uuid.UUID(document_id), chunks_count)


@@ -0,0 +1,157 @@
"""
DSFA Qdrant Service — Vector store operations.
Contains:
- DSFAQdrantService: Qdrant client wrapper for DSFA corpus
"""
import os
import uuid
from typing import List, Dict, Optional
from dataclasses import asdict
from qdrant_client import QdrantClient
from qdrant_client.models import (
VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchAny
)
from dsfa_corpus_store import DSFAChunkPayload
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
DSFA_COLLECTION = "bp_dsfa_corpus"
VECTOR_SIZE = 1024 # BGE-M3
class DSFAQdrantService:
"""Qdrant operations for DSFA corpus."""
def __init__(self, url: Optional[str] = None):
self.url = url or QDRANT_URL
self._client = None
@property
def client(self) -> QdrantClient:
if self._client is None:
self._client = QdrantClient(url=self.url, check_compatibility=False)
return self._client
async def ensure_collection(self) -> bool:
"""Ensure DSFA collection exists."""
try:
collections = self.client.get_collections().collections
collection_names = [c.name for c in collections]
if DSFA_COLLECTION not in collection_names:
self.client.create_collection(
collection_name=DSFA_COLLECTION,
vectors_config=VectorParams(
size=VECTOR_SIZE,
distance=Distance.COSINE
)
)
print(f"Created collection: {DSFA_COLLECTION}")
return True
except Exception as e:
print(f"Error ensuring collection: {e}")
return False
async def index_chunks(
self,
chunks: List[Dict],
embeddings: List[List[float]]
) -> int:
"""Index chunks into Qdrant."""
if not chunks or not embeddings:
return 0
points = []
for chunk, embedding in zip(chunks, embeddings):
point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk["chunk_id"]))
payload = DSFAChunkPayload(
chunk_id=chunk["chunk_id"],
document_id=chunk["document_id"],
source_id=chunk["source_id"],
content=chunk["content"],
section_title=chunk.get("section_title"),
source_code=chunk["source_code"],
source_name=chunk["source_name"],
attribution_text=chunk["attribution_text"],
license_code=chunk["license_code"],
attribution_required=chunk.get("attribution_required", True),
document_type=chunk.get("document_type", ""),
category=chunk.get("category", ""),
language=chunk.get("language", "de"),
page_number=chunk.get("page_number")
)
points.append(
PointStruct(
id=point_id,
vector=embedding,
payload=asdict(payload)
)
)
self.client.upsert(collection_name=DSFA_COLLECTION, points=points)
return len(points)
async def search(
self,
query_embedding: List[float],
source_codes: Optional[List[str]] = None,
document_types: Optional[List[str]] = None,
categories: Optional[List[str]] = None,
limit: int = 10
) -> List[Dict]:
"""Search DSFA corpus with filters."""
must_conditions = []
# One MatchAny per field gives OR semantics within that field; separate
# MatchValue must-conditions on the same key would require every value
# to match simultaneously and return nothing for two or more values.
if source_codes:
must_conditions.append(
FieldCondition(key="source_code", match=MatchAny(any=source_codes))
)
if document_types:
must_conditions.append(
FieldCondition(key="document_type", match=MatchAny(any=document_types))
)
if categories:
must_conditions.append(
FieldCondition(key="category", match=MatchAny(any=categories))
)
query_filter = Filter(must=must_conditions) if must_conditions else None
results = self.client.query_points(
collection_name=DSFA_COLLECTION,
query=query_embedding,
query_filter=query_filter,
limit=limit
)
return [
{
"id": str(r.id),
"score": r.score,
**r.payload
}
for r in results.points
]
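# Usage sketch (assumes a 1024-dim BGE-M3 query embedding is available;
# the source codes shown are hypothetical):
#   results = await service.search(
#       query_embedding=embedding,        # list of 1024 floats
#       source_codes=["GDPR", "DSK"],     # OR semantics via MatchAny
#       limit=5,
#   )
# Each result dict carries the chunk payload plus "id" and "score".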
async def get_stats(self) -> Dict:
"""Get collection statistics."""
try:
info = self.client.get_collection(DSFA_COLLECTION)
return {
"collection": DSFA_COLLECTION,
"vectors_count": info.vectors_count,
"points_count": info.points_count,
"status": info.status.value
}
except Exception as e:
return {"error": str(e), "collection": DSFA_COLLECTION}

File diff suppressed because it is too large


@@ -0,0 +1,305 @@
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Tuple
from cv_ocr_engines import (
_words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)
logger = logging.getLogger(__name__)
def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
"""Remove blue bullet/artifact word_boxes (Step 5i).
Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
and syllable-split word merging.
"""
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
_REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}
bullet_removed = 0
for z in zones_data:
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
if len(wbs) < 2:
continue
to_remove: set = set()
# Rule (a): tiny coloured symbols
for i, wb in enumerate(wbs):
cn = wb.get("color_name", "black")
if (cn != "black"
and wb.get("width", 0) * wb.get("height", 0) < 200
and wb.get("conf", 100) < 85):
to_remove.add(i)
# Rule (a2): isolated non-alphanumeric symbols
for i, wb in enumerate(wbs):
t = (wb.get("text") or "").strip()
if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
if t in _REMOVE_SYMBOLS:
to_remove.add(i)
# Rule (b) + (c): overlap and duplicate detection
to_merge: List[Tuple[int, int]] = []
indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
for p in range(len(indexed) - 1):
i1, w1 = indexed[p]
i2, w2 = indexed[p + 1]
x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
min_w = min(w1.get("width", 1), w2.get("width", 1))
gap = x2s - x1e
overlap_pct = overlap / min_w if min_w > 0 else 0
if overlap_pct > 0.20:
t1 = (w1.get("text") or "").strip()
t2 = (w2.get("text") or "").strip()
# Syllable-split words
if (overlap_pct <= 0.75
and _ALPHA_WORD_RE.match(t1)
and _ALPHA_WORD_RE.match(t2)):
to_merge.append((i1, i2))
continue
# High overlap with short prefix
if (overlap_pct > 0.75
and _ALPHA_WORD_RE.match(t1)
and _ALPHA_WORD_RE.match(t2)
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
to_merge.append((i1, i2))
continue
if overlap_pct <= 0.40:
continue
c1 = w1.get("conf", 50)
c2 = w2.get("conf", 50)
# Very high overlap: prefer IPA-dictionary word
if overlap_pct > 0.90 and t1.lower() != t2.lower():
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
if in_dict_1 and not in_dict_2:
to_remove.add(i2)
continue
elif in_dict_2 and not in_dict_1:
to_remove.add(i1)
continue
if c1 < c2:
to_remove.add(i1)
elif c2 < c1:
to_remove.add(i2)
else:
if w1.get("height", 0) > w2.get("height", 0):
to_remove.add(i1)
else:
to_remove.add(i2)
elif (gap < 6
and w1.get("color_name") == "blue"
and w2.get("color_name") == "blue"
and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
c1 = w1.get("conf", 50)
c2 = w2.get("conf", 50)
to_remove.add(i1 if c1 <= c2 else i2)
# Execute merges first (syllable-split words)
if to_merge:
merge_parent: Dict[int, int] = {}
for mi1, mi2 in to_merge:
actual_mi1 = mi1
while actual_mi1 in merge_parent:
actual_mi1 = merge_parent[actual_mi1]
if actual_mi1 in to_remove or mi2 in to_remove:
continue
if mi2 in merge_parent:
continue
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
mt2 = (mw2.get("text") or "").strip()
merged_text = mt1 + mt2
mx = min(mw1["left"], mw2["left"])
my = min(mw1["top"], mw2["top"])
mr = max(mw1["left"] + mw1["width"],
mw2["left"] + mw2["width"])
mb = max(mw1["top"] + mw1["height"],
mw2["top"] + mw2["height"])
mw1["text"] = merged_text
mw1["left"] = mx
mw1["top"] = my
mw1["width"] = mr - mx
mw1["height"] = mb - my
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
to_remove.add(mi2)
merge_parent[mi2] = actual_mi1
bullet_removed -= 1
if to_remove:
bullet_removed += len(to_remove)
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
cell["word_boxes"] = filtered
if not cell.get("_ipa_corrected"):
cell["text"] = _words_to_reading_order_text(filtered)
if bullet_removed:
for z in zones_data:
z["cells"] = [c for c in z.get("cells", [])
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)
def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
_COMMON_SHORT_WORDS = {
"ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
"ob", "so", "um", "zu", "wo", "je", "oh", "or",
"die", "der", "das", "dem", "den", "des", "ein", "und",
"auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
"if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
"on", "or", "so", "to", "up", "us", "we",
"the", "and", "but", "for", "not",
}
_PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
artifact_cells_removed = 0
for z in zones_data:
before = len(z.get("cells", []))
kept = []
for cell in z.get("cells", []):
text = (cell.get("text") or "").strip()
core = text.rstrip(".,;:!?'\"")
is_artifact = False
if not core:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
and not re.match(r'^[pPsS]\.?\d+$', core)):
is_artifact = True
if is_artifact:
kept.append(None)
else:
kept.append(cell)
z["cells"] = [c for c in kept if c is not None]
artifact_cells_removed += before - len(z["cells"])
if artifact_cells_removed:
for z in zones_data:
cell_ris = {c.get("row_index") for c in z.get("cells", [])}
z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)
def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
"""Normalise word_box order to reading order (Step 5j)."""
wb_reordered = 0
for z in zones_data:
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
if len(wbs) < 2:
continue
lines = _group_words_into_lines(wbs, y_tolerance_px=15)
sorted_wbs = [w for line in lines for w in line]
if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
cell["word_boxes"] = sorted_wbs
wb_reordered += 1
if wb_reordered:
logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)
def _enforce_max_columns(
zones_data: List[Dict[str, Any]],
max_columns: int,
) -> None:
"""Enforce max_columns by merging narrowest columns (Step 5k)."""
for z in zones_data:
if z.get("zone_type") != "content":
continue
cols = z.get("columns", [])
cells = z.get("cells", [])
if len(cols) <= max_columns:
continue
logger.info(
"max_columns=%d: zone %s has %d columns -> merging",
max_columns, z.get("zone_index"), len(cols),
)
cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))
while len(cols) > max_columns:
narrowest = cols_by_width.pop(0)
ni = narrowest["index"]
sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
if pos + 1 < len(sorted_by_x):
merge_target = sorted_by_x[pos + 1]
elif pos > 0:
merge_target = sorted_by_x[pos - 1]
else:
break
ti = merge_target["index"]
merge_target["x_min_px"] = min(
merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
)
merge_target["x_max_px"] = max(
merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
)
if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])
for cell in cells:
if cell.get("col_index") == ni:
cell["col_index"] = ti
existing = next(
(c for c in cells if c["col_index"] == ti
and c["row_index"] == cell["row_index"]
and c is not cell),
None,
)
if existing:
existing["text"] = (
(existing.get("text", "") + " " + cell.get("text", "")).strip()
)
existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
cell["_merged"] = True
z["cells"] = [c for c in cells if not c.get("_merged")]
cells = z["cells"]
cols.remove(narrowest)
cols_by_width = [c for c in cols_by_width if c["index"] != ni]
# Re-index columns 0..N-1
for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
old_idx = col["index"]
col["index"] = new_idx
for cell in cells:
if cell.get("col_index") == old_idx:
cell["col_index"] = new_idx
logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))


@@ -0,0 +1,390 @@
"""
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
divider removal, connector normalization, border strip detection, and
alphabet sidebar removal.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List
from cv_ocr_engines import _words_to_reading_order_text
logger = logging.getLogger(__name__)
_PIPE_RE = re.compile(r"^\|+$")
def _cleanup_zones(
zones_data: List[Dict[str, Any]],
border_prefiltered: bool,
session_id: str,
) -> bool:
"""Clean up zone data: remove junk rows, artifacts, pipes, border strips.
Args:
zones_data: List of zone dicts (modified in place).
border_prefiltered: Whether border words were already pre-filtered.
session_id: For logging.
Returns:
Updated border_prefiltered flag.
"""
_remove_junk_rows(zones_data)
_remove_artifact_cells(zones_data)
_remove_oversized_word_boxes(zones_data)
_remove_pipe_dividers(zones_data)
_normalize_connector_columns(zones_data)
border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
_remove_alphabet_sidebars(zones_data)
return border_prefiltered
def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
"""Remove rows where ALL cells contain only short, low-confidence text.
Also removes 'oversized stub' rows and 'scattered debris' rows.
"""
_JUNK_CONF_THRESHOLD = 50
_JUNK_MAX_TEXT_LEN = 3
for z in zones_data:
cells = z.get("cells", [])
rows = z.get("rows", [])
if not cells or not rows:
continue
# Compute median word height across the zone for oversized detection
all_wb_heights = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28
junk_row_indices = set()
for row in rows:
ri = row["index"]
row_cells = [c for c in cells if c.get("row_index") == ri]
if not row_cells:
continue
row_wbs = [
wb for cell in row_cells
for wb in cell.get("word_boxes") or []
]
# Rule 1: ALL word_boxes are low-conf AND short text
all_junk = True
for wb in row_wbs:
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 0)
if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
all_junk = False
break
if all_junk and row_wbs:
junk_row_indices.add(ri)
continue
# Rule 2: oversized stub -- <=3 words, short total text,
# and word height > 1.8x median
if len(row_wbs) <= 3:
total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
has_page_ref = any(
re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
for wb in row_wbs
)
if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
junk_row_indices.add(ri)
continue
# Rule 3: scattered debris -- rows with only tiny fragments
longest = max((len((wb.get("text") or "").strip()) for wb in row_wbs), default=0)
if row_wbs and longest <= 2:
junk_row_indices.add(ri)
continue
if junk_row_indices:
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
logger.info(
"build-grid: removed %d junk rows from zone %d: %s",
len(junk_row_indices), z["zone_index"],
sorted(junk_row_indices),
)
def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
"""Remove individual cells with a single very-short, low-conf word."""
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
artifact_ids = set()
for cell in cells:
wbs = cell.get("word_boxes") or []
if len(wbs) != 1:
continue
wb = wbs[0]
text = (wb.get("text") or "").strip()
conf = wb.get("conf", 100)
if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
artifact_ids.add(cell.get("cell_id"))
if artifact_ids:
z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
logger.info(
"build-grid: removed %d artifact cells from zone %d: %s",
len(artifact_ids), z.get("zone_index", 0),
[c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
)
def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
"""Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wh = [
wb["height"]
for cell in cells
for wb in cell.get("word_boxes") or []
if wb.get("height", 0) > 0
]
if not all_wh:
continue
med_h = sorted(all_wh)[len(all_wh) // 2]
oversized_threshold = med_h * 3
removed_oversized = 0
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
if len(filtered) < len(wbs):
removed_oversized += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_oversized:
z["cells"] = [c for c in cells if c.get("word_boxes")]
logger.info(
"build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
removed_oversized, oversized_threshold, z.get("zone_index", 0),
)
def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
"""Remove pipe-character word_boxes (column divider artifacts)."""
for z in zones_data:
if z.get("vsplit_group") is not None:
continue # pipes already removed before split
removed_pipes = 0
for cell in z.get("cells", []):
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
if len(filtered) < len(wbs):
removed_pipes += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
if removed_pipes:
z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"build-grid: removed %d pipe-divider word_boxes from zone %d",
removed_pipes, z.get("zone_index", 0),
)
# Strip pipe chars ONLY from cell edges (OCR artifacts).
# Preserve pipes embedded in words as syllable separators.
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if "|" in text:
cleaned = text.strip("|").strip()
if cleaned != text.strip():
cell["text"] = cleaned
def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Normalize narrow connector columns where OCR appends noise chars.
In synonym dictionaries a narrow column repeats the same word
(e.g. "oder") in every row. OCR sometimes appends noise chars.
"""
for z in zones_data:
cols = z.get("columns", [])
cells = z.get("cells", [])
if not cols or not cells:
continue
for col in cols:
ci = col.get("index")
col_cells = [c for c in cells if c.get("col_index") == ci]
if len(col_cells) < 3:
continue
text_counts: Dict[str, int] = {}
for c in col_cells:
t = (c.get("text") or "").strip()
if t:
text_counts[t] = text_counts.get(t, 0) + 1
if not text_counts:
continue
dominant_text = max(text_counts, key=text_counts.get) # type: ignore[arg-type]
dominant_count = text_counts[dominant_text]
if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
continue
fixed = 0
for c in col_cells:
t = (c.get("text") or "").strip()
if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
c["text"] = dominant_text
wbs = c.get("word_boxes") or []
if len(wbs) == 1:
wbs[0]["text"] = dominant_text
fixed += 1
if fixed:
logger.info(
"build-grid: normalized %d outlier cells in connector column %d "
"(dominant='%s') zone %d",
fixed, ci, dominant_text, z.get("zone_index", 0),
)
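# Example (hand-traced): in a connector column whose cells read
# ["oder", "oder", "oder,", "oder"], the dominant text "oder" covers 3/4
# of the cells, so the outlier "oder," (dominant prefix plus <= 2 extra
# chars) is normalised back to "oder".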
def _remove_border_strips(
zones_data: List[Dict[str, Any]],
border_prefiltered: bool,
) -> bool:
"""Detect and remove page-border decoration strips.
Returns updated border_prefiltered flag.
"""
border_strip_removed = 0
if border_prefiltered:
logger.info("Step 4e: skipped (border pre-filter already applied)")
return border_prefiltered
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wbs_with_cell: list = []
for cell in cells:
for wb in cell.get("word_boxes") or []:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
if len(all_wbs_with_cell) < 10:
continue
all_wbs_with_cell.sort(key=lambda t: t[0])
total = len(all_wbs_with_cell)
# -- Left-edge scan --
left_strip_count = 0
left_gap = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
)
gap = all_wbs_with_cell[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
left_gap = gap
break
# -- Right-edge scan --
right_strip_count = 0
right_gap = 0
running_left = all_wbs_with_cell[-1][0]
for gi in range(total - 1, 0, -1):
running_left = min(running_left, all_wbs_with_cell[gi][0])
prev_right = (
all_wbs_with_cell[gi - 1][0]
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
)
gap = running_left - prev_right
if gap > 30:
right_strip_count = total - gi
right_gap = gap
break
strip_wbs: set = set()
strip_side = ""
strip_gap = 0
strip_count = 0
if left_strip_count > 0 and left_strip_count / total < 0.20:
strip_side = "left"
strip_count = left_strip_count
strip_gap = left_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
elif right_strip_count > 0 and right_strip_count / total < 0.20:
strip_side = "right"
strip_count = right_strip_count
strip_gap = right_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
if not strip_wbs:
continue
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
if len(filtered) < len(wbs):
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
strip_gap, strip_count, total,
)
return border_prefiltered
def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
"""Remove decorative edge columns (alphabet sidebar safety net).
Dictionary pages have A-Z letter sidebars that OCR reads as single-
character word_boxes.
"""
for z in zones_data:
columns = z.get("columns", [])
cells = z.get("cells", [])
if len(columns) < 3 or not cells:
continue
col_cells: Dict[str, List[Dict]] = {}
for cell in cells:
ct = cell.get("col_type", "")
if ct.startswith("column_"):
col_cells.setdefault(ct, []).append(cell)
col_types_ordered = sorted(col_cells.keys())
if len(col_types_ordered) < 3:
continue
for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
edge_cells_list = col_cells.get(edge_ct, [])
if len(edge_cells_list) < 3:
continue
texts = [(c.get("text") or "").strip() for c in edge_cells_list]
avg_len = sum(len(t) for t in texts) / len(texts)
single_char = sum(1 for t in texts if len(t) <= 1)
single_ratio = single_char / len(texts)
if avg_len > 1.5:
continue
if single_ratio < 0.7:
continue
removed_count = len(edge_cells_list)
edge_ids = {id(c) for c in edge_cells_list}
z["cells"] = [c for c in cells if id(c) not in edge_ids]
z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
logger.info(
"Step 4f: removed decorative edge column '%s' from zone %d "
"(%d cells, avg_len=%.1f, single_char=%.0f%%)",
edge_ct, z.get("zone_index", 0), removed_count,
avg_len, single_ratio * 100,
)
break # only remove one edge per zone

File diff suppressed because it is too large


@@ -0,0 +1,452 @@
"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional
from grid_build_cell_ops import (
_remove_bullets_and_artifacts,
_remove_garbled_cells,
_normalize_word_order,
_enforce_max_columns,
)
logger = logging.getLogger(__name__)
def _finalize_grid(
zones_data: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
img_bgr: Any,
img_w: int,
img_h: int,
session_id: str,
max_columns: Optional[int],
ipa_mode: str,
syllable_mode: str,
en_col_type: Optional[str],
ipa_target_cols: set,
all_content_cols: set,
skip_ipa: bool,
document_category: Optional[str],
margin_strip_detected: bool,
page_number_info: Optional[Dict],
boxes_detected: int,
recovered_count: int,
duration: float,
) -> dict:
"""Run final processing steps and assemble result dict.
Handles: bullet removal, artifact cells, word ordering, max_columns,
dictionary detection, syllable dividers, spell check, empty columns,
internal flag cleanup, and result assembly.
"""
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
# 5i. Remove blue bullet/artifact word_boxes
_remove_bullets_and_artifacts(zones_data)
# 5j-pre. Remove cells whose text is entirely garbled / artifact noise
_remove_garbled_cells(zones_data)
# 5j. Normalise word_box order to reading order
_normalize_word_order(zones_data)
# 5k. Enforce max_columns by merging narrowest columns
if max_columns and max_columns > 0:
_enforce_max_columns(zones_data, max_columns)
# --- Dictionary detection on assembled grid ---
dict_detection = _detect_dictionary(
zones_data, img_w, img_h, document_category, margin_strip_detected
)
# --- Word-gap merge ---
try:
from cv_syllable_detect import merge_word_gaps_in_zones
merge_word_gaps_in_zones(zones_data, session_id)
except Exception as e:
logger.warning("Word-gap merge failed: %s", e)
# --- Pipe auto-correction ---
try:
from cv_syllable_detect import autocorrect_pipe_artifacts
autocorrect_pipe_artifacts(zones_data, session_id)
except Exception as e:
logger.warning("Pipe autocorrect failed: %s", e)
# --- Syllable divider insertion ---
syllable_insertions = _insert_syllable_dividers(
zones_data, img_bgr, session_id, syllable_mode, dict_detection,
en_col_type, all_content_cols, total_cols,
)
# --- Split merged words ---
_split_merged_words(zones_data, session_id)
# --- Ensure space before IPA/phonetic brackets ---
_fix_ipa_spacing(zones_data)
# --- SmartSpellChecker ---
_run_spell_checker(zones_data, session_id, en_col_type, total_cols)
# --- Debug log cell counts per column ---
for z in zones_data:
if z.get("zone_type") == "content":
from collections import Counter as _Counter
_cc = _Counter(c.get("col_index") for c in z.get("cells", []))
_cols = z.get("columns", [])
logger.info(
"pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
)
# --- Remove empty columns ---
_remove_empty_columns(zones_data)
# Clean up internal flags before returning
for z in zones_data:
for cell in z.get("cells", []):
cell.pop("_ipa_corrected", None)
# 6. Build result
return _assemble_result(
zones_data, all_words, img_w, img_h, session_id,
ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
dict_detection, page_number_info, boxes_detected,
recovered_count, duration, syllable_insertions,
)
def _detect_dictionary(
zones_data: List[Dict[str, Any]],
img_w: int,
img_h: int,
document_category: Optional[str],
margin_strip_detected: bool,
) -> Dict[str, Any]:
"""Run dictionary detection on the assembled grid."""
from cv_layout import _score_dictionary_signals
dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
try:
from cv_vocab_types import ColumnGeometry
for z in zones_data:
zone_cells = z.get("cells", [])
zone_cols = z.get("columns", [])
if len(zone_cols) < 2 or len(zone_cells) < 10:
continue
pseudo_geoms = []
for col in zone_cols:
ci = col["index"]
col_cells = [c for c in zone_cells if c.get("col_index") == ci]
col_words = []
for cell in col_cells:
for wb in cell.get("word_boxes") or []:
col_words.append({
"text": wb.get("text", ""),
"conf": wb.get("conf", 0),
"top": wb.get("top", 0),
"left": wb.get("left", 0),
"height": wb.get("height", 0),
"width": wb.get("width", 0),
})
if not cell.get("word_boxes") and cell.get("text"):
col_words.append({
"text": cell["text"],
"conf": cell.get("confidence", 50),
"top": cell.get("bbox_px", {}).get("y", 0),
"left": cell.get("bbox_px", {}).get("x", 0),
"height": cell.get("bbox_px", {}).get("h", 20),
"width": cell.get("bbox_px", {}).get("w", 50),
})
col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
pseudo_geoms.append(ColumnGeometry(
index=ci, x=col.get("x_min_px", 0), y=0,
width=max(col_w, 1), height=img_h,
word_count=len(col_words), words=col_words,
width_ratio=col_w / max(img_w, 1),
))
if len(pseudo_geoms) >= 2:
dd = _score_dictionary_signals(
pseudo_geoms,
document_category=document_category,
margin_strip_detected=margin_strip_detected,
)
if dd["confidence"] > dict_detection["confidence"]:
dict_detection = dd
except Exception as e:
logger.warning("Dictionary detection failed: %s", e)
return dict_detection
def _insert_syllable_dividers(
zones_data: List[Dict[str, Any]],
img_bgr: Any,
session_id: str,
syllable_mode: str,
dict_detection: Dict[str, Any],
en_col_type: Optional[str],
all_content_cols: set,
total_cols: int,
) -> int:
"""Insert syllable dividers for dictionary pages. Returns insertion count."""
syllable_insertions = 0
if syllable_mode == "none" or img_bgr is None:
if syllable_mode == "none":
for z in zones_data:
for cell in z.get("cells", []):
t = cell.get("text", "")
if "|" in t:
cell["text"] = t.replace("|", "")
return syllable_insertions
_syllable_eligible = False
if syllable_mode in ("all", "de", "en"):
_syllable_eligible = True
elif (dict_detection.get("is_dictionary")
and dict_detection.get("article_col_index") is not None):
_syllable_eligible = True
_syllable_col_filter: Optional[set] = None
if syllable_mode == "en":
_syllable_col_filter = {en_col_type} if en_col_type else set()
elif syllable_mode == "de":
if en_col_type and total_cols >= 3:
_syllable_col_filter = all_content_cols - {en_col_type}
if _syllable_eligible:
try:
from cv_syllable_detect import insert_syllable_dividers
force_syllables = (syllable_mode in ("all", "de", "en"))
syllable_insertions = insert_syllable_dividers(
zones_data, img_bgr, session_id,
force=force_syllables,
col_filter=_syllable_col_filter,
)
except Exception as e:
logger.warning("Syllable insertion failed: %s", e)
return syllable_insertions
def _split_merged_words(
zones_data: List[Dict[str, Any]],
session_id: str,
) -> None:
"""Split merged words using dictionary lookup."""
try:
from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
if not _SPELL_AVAILABLE:
return
split_count = 0
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if not text:
continue
parts = []
changed = False
for token in text.split():
clean = token
bracket_pos = clean.find('[')
suffix_ipa = ""
if bracket_pos > 0:
suffix_ipa = clean[bracket_pos:]
clean = clean[:bracket_pos]
suffix_punct = ""
stripped = clean.rstrip(".,!?;:'\")")
if stripped != clean:
suffix_punct = clean[len(stripped):]
clean = stripped
suffix = suffix_punct + suffix_ipa
contraction = ""
if "'" in clean and clean.index("'") >= 2:
apos_pos = clean.index("'")
contraction = clean[apos_pos:]
clean = clean[:apos_pos]
suffix = contraction + suffix
if len(clean) >= 4 and clean.isalpha():
split = _try_split_merged_word(clean)
if split:
parts.append(split + suffix)
changed = True
continue
parts.append(token)
if changed:
cell["text"] = " ".join(parts)
split_count += 1
if split_count:
logger.info("build-grid session %s: split %d merged words", session_id, split_count)
except ImportError:
pass
def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
"""Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if text and "[" in text:
fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
if fixed != text:
cell["text"] = fixed
def _run_spell_checker(
zones_data: List[Dict[str, Any]],
session_id: str,
en_col_type: Optional[str],
total_cols: int,
) -> None:
"""Run SmartSpellChecker on all cells."""
try:
from smart_spell import SmartSpellChecker
_ssc = SmartSpellChecker()
spell_fix_count = 0
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if not text or not text.strip():
continue
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
if total_cols >= 3 and en_col_type:
lang = "en" if ct == en_col_type else "de"
else:
lang = "auto"
result = _ssc.correct_text(text, lang=lang)
if result.changed:
cell["text"] = result.corrected
spell_fix_count += 1
if spell_fix_count:
logger.info(
"build-grid session %s: SmartSpellChecker fixed %d cells",
session_id, spell_fix_count,
)
except ImportError:
logger.debug("SmartSpellChecker not available in build-grid")
except Exception as e:
logger.warning("SmartSpellChecker error in build-grid: %s", e)
def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
"""Remove columns that have no cells assigned."""
for z in zones_data:
cells = z.get("cells", [])
used_col_indices = {c.get("col_index") for c in cells}
old_cols = z.get("columns", [])
new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
if len(new_cols) < len(old_cols):
old_to_new = {}
for new_i, col in enumerate(new_cols):
old_i = col.get("col_index", col.get("index", new_i))
old_to_new[old_i] = new_i
col["col_index"] = new_i
col["index"] = new_i
col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
for cell in cells:
old_ci = cell.get("col_index", 0)
cell["col_index"] = old_to_new.get(old_ci, old_ci)
cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
z["columns"] = new_cols
def _assemble_result(
zones_data: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
img_w: int,
img_h: int,
session_id: str,
ipa_mode: str,
syllable_mode: str,
ipa_target_cols: set,
skip_ipa: bool,
dict_detection: Dict[str, Any],
page_number_info: Optional[Dict],
boxes_detected: int,
recovered_count: int,
duration: float,
syllable_insertions: int,
) -> dict:
"""Build the final result dict (Phase 6)."""
total_cells = sum(len(z.get("cells", [])) for z in zones_data)
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
# Collect color statistics
color_stats: Dict[str, int] = {}
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
cn = wb.get("color_name", "black")
color_stats[cn] = color_stats.get(cn, 0) + 1
# Compute layout metrics
all_content_row_heights: List[float] = []
for z in zones_data:
for row in z.get("rows", []):
if not row.get("is_header", False):
h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
if h > 0:
all_content_row_heights.append(h)
avg_row_height = (
sum(all_content_row_heights) / len(all_content_row_heights)
if all_content_row_heights else 30.0
)
font_size_suggestion = max(10, int(avg_row_height * 0.6))
return {
"session_id": session_id,
"image_width": img_w,
"image_height": img_h,
"zones": zones_data,
"boxes_detected": boxes_detected,
"summary": {
"total_zones": len(zones_data),
"total_columns": total_columns,
"total_rows": total_rows,
"total_cells": total_cells,
"total_words": len(all_words),
"recovered_colored": recovered_count,
"color_stats": color_stats,
},
"formatting": {
"bold_columns": [],
"header_rows": [],
},
"layout_metrics": {
"page_width_px": img_w,
"page_height_px": img_h,
"avg_row_height_px": round(avg_row_height, 1),
"font_size_suggestion_px": font_size_suggestion,
},
"dictionary_detection": {
"is_dictionary": dict_detection.get("is_dictionary", False),
"confidence": dict_detection.get("confidence", 0.0),
"signals": dict_detection.get("signals", {}),
"article_col_index": dict_detection.get("article_col_index"),
"headword_col_index": dict_detection.get("headword_col_index"),
},
"processing_modes": {
"ipa_mode": ipa_mode,
"syllable_mode": syllable_mode,
"ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
"syllables_applied": syllable_insertions > 0,
},
"page_number": page_number_info,
"duration_seconds": round(duration, 2),
}


@@ -0,0 +1,489 @@
"""
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
parenthesis fix, IPA phonetic correction, page ref extraction, and
slash-IPA conversion.
Extracted from grid_build_core.py for maintainability.
"""
import logging
import re
from typing import Any, Dict, List, Optional, Set, Tuple
from cv_color_detect import detect_word_colors
from cv_ocr_engines import (
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
_lookup_ipa,
)
from grid_editor_helpers import (
_detect_heading_rows_by_color,
_detect_heading_rows_by_single_cell,
)
logger = logging.getLogger(__name__)
def _process_text(
zones_data: List[Dict[str, Any]],
img_bgr: Any,
img_w: int,
img_h: int,
ipa_mode: str,
page_number_info: Optional[Dict],
) -> Dict[str, Any]:
"""Run color annotation, heading detection, IPA correction, and page refs.
Args:
zones_data: List of zone dicts (modified in place).
img_bgr: BGR image array (or None).
img_w: Image width.
img_h: Image height.
ipa_mode: IPA processing mode.
page_number_info: Existing page number metadata (may be None).
Returns:
Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
skip_ipa, page_number_info.
"""
# 5. Color annotation on final word_boxes in cells
if img_bgr is not None:
all_wb: List[Dict] = []
for z in zones_data:
for cell in z.get("cells", []):
all_wb.extend(cell.get("word_boxes", []))
detect_word_colors(img_bgr, all_wb)
# 5a. Heading detection by color + height
heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
if heading_count:
logger.info("Detected %d heading rows by color+height", heading_count)
# 5b. Fix unmatched parentheses in cell text
for z in zones_data:
for cell in z.get("cells", []):
text = cell.get("text", "")
if ")" in text and "(" not in text:
cell["text"] = "(" + text
# 5c. IPA phonetic correction
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
en_col_type = None
ipa_target_cols: set = set()
all_content_cols: set = set()
skip_ipa = (ipa_mode == "none")
# When ipa_mode=none, strip ALL square brackets from ALL content columns
if skip_ipa:
_SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
text = cell.get("text", "")
if "[" in text:
stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
if stripped != text:
cell["text"] = stripped.strip()
cell["_ipa_corrected"] = True
if not skip_ipa and total_cols >= 3:
en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
all_cells, total_cols, ipa_mode, zones_data
)
elif not skip_ipa:
# Collect all_content_cols even when <3 cols (needed by finalize)
for cell in all_cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
all_content_cols.add(ct)
# 5e. Heading detection by single-cell rows
single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
if single_heading_count:
logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
# 5f. Strip IPA from headings
for z in zones_data:
for cell in z.get("cells", []):
if cell.get("col_type") != "heading":
continue
text = cell.get("text", "")
stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
if stripped and stripped != text:
cell["text"] = stripped
# 5g. Extract page_ref cells and footer rows
    page_number_info = _extract_page_refs_and_footers(zones_data, page_number_info)
# 5h. Convert slash-delimited IPA to bracket notation
_convert_slash_ipa(zones_data, skip_ipa, en_col_type)
return {
"en_col_type": en_col_type,
"ipa_target_cols": ipa_target_cols,
"all_content_cols": all_content_cols,
"skip_ipa": skip_ipa,
"page_number_info": page_number_info,
}
def _run_ipa_correction(
all_cells: List[Dict],
total_cols: int,
ipa_mode: str,
zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
"""Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
en_col_type = None
all_content_cols: set = set()
# Detect English headword column via IPA signals
col_ipa_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
txt = cell.get("text", "") or ""
if txt.strip():
all_content_cols.add(ct)
if '[' in txt or _text_has_garbled_ipa(txt):
col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
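    # The column with the most IPA-looking cells (brackets or garbled IPA)
    # is taken to be the English headword column.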
if col_ipa_count:
en_col_type = max(col_ipa_count, key=col_ipa_count.get)
elif ipa_mode == "all":
col_cell_count: Dict[str, int] = {}
for cell in all_cells:
ct = cell.get("col_type", "")
if ct.startswith("column_") and (cell.get("text") or "").strip():
col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
if col_cell_count:
en_col_type = max(col_cell_count, key=col_cell_count.get)
# Decide which columns to process based on ipa_mode
en_ipa_target_cols: set = set()
de_ipa_target_cols: set = set()
if ipa_mode in ("auto", "en"):
if en_col_type:
en_ipa_target_cols.add(en_col_type)
elif ipa_mode == "de":
de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
elif ipa_mode == "all":
if en_col_type:
en_ipa_target_cols.add(en_col_type)
de_ipa_target_cols = all_content_cols - en_ipa_target_cols
# --- Strip IPA from columns NOT in the target set ---
_SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    # ipa_mode == "none" is handled by the caller before this helper runs;
    # the extra guard here is kept only as a safety net.
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
for cell in all_cells:
ct = cell.get("col_type", "")
if ct not in strip_cols:
continue
text = cell.get("text", "")
if "[" in text:
stripped = _SQUARE_BRACKET_RE.sub("", text)
if stripped != text:
cell["text"] = stripped.strip()
cell["_ipa_corrected"] = True
    # Snapshot cell text before any IPA pass so changed cells can be flagged
    # afterwards (also needed when only the German pass runs below; defining
    # this inside the English branch would raise a NameError later).
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}
    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
# --- German IPA (wiki-pronunciation-dict + epitran) ---
if de_ipa_target_cols:
from cv_ipa_german import insert_german_ipa
insert_german_ipa(all_cells, de_ipa_target_cols)
ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols
# Mark cells whose text was changed by IPA correction
for cell in all_cells:
if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
cell["_ipa_corrected"] = True
# 5d. Fix IPA continuation cells
skip_ipa = (ipa_mode == "none")
_REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
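    # Any of these genuine IPA symbols in a cell means OCR already produced
    # a plausible transcription, so the cell is left untouched below.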
ipa_cont_fixed = 0
for z in ([] if skip_ipa else zones_data):
rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
z_cells = z.get("cells", [])
for idx, row in enumerate(rows_sorted):
if idx == 0:
continue
ri = row["index"]
row_cells = [c for c in z_cells if c.get("row_index") == ri]
for cell in row_cells:
ct = cell.get("col_type", "")
if not ct.startswith("column_"):
continue
cell_text = (cell.get("text") or "").strip()
if not cell_text:
wb_texts = [w.get("text", "")
for w in cell.get("word_boxes", [])]
cell_text = " ".join(wb_texts).strip()
if not cell_text:
continue
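                    # Two continuation shapes are handled: a fully bracketed
                    # but garbled IPA cell, and a lone garbled fragment that
                    # spilled into the row below its headword.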
is_bracketed = (
cell_text.startswith('[') and cell_text.endswith(']')
)
if is_bracketed:
if not _text_has_garbled_ipa(cell_text):
continue
if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
continue
else:
content_cells_in_row = [
c for c in row_cells
if c.get("col_type", "").startswith("column_")
and c.get("col_type") != "column_1"
]
if len(content_cells_in_row) != 1:
continue
if not _text_has_garbled_ipa(cell_text):
continue
if any(c in _REAL_IPA_CHARS for c in cell_text):
continue
_words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
if len(_words_in_text) >= 3:
continue
# Find headword in previous row, same column
prev_ri = rows_sorted[idx - 1]["index"]
prev_same_col = [
c for c in z_cells
if c.get("row_index") == prev_ri
and c.get("col_type") == ct
]
if not prev_same_col:
continue
prev_text = prev_same_col[0].get("text", "")
fixed = fix_ipa_continuation_cell(
cell_text, prev_text, pronunciation="british",
)
if fixed != cell_text:
cell["text"] = fixed
ipa_cont_fixed += 1
logger.info(
"IPA continuation R%d %s: '%s' -> '%s'",
ri, ct, cell_text, fixed,
)
if ipa_cont_fixed:
logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)
return en_col_type, ipa_target_cols, all_content_cols
def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
) -> Optional[Dict]:
    """Extract page_ref cells and footer rows from content zones.
    Modifies zones_data in place and returns the (possibly newly created)
    page_number_info dict — rebinding a None argument would not be visible
    to the caller otherwise.
    """
_REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
_PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
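    # Matches "p. 12", "P 7", ". 12" etc.; the leading comma is presumably
    # an OCR misread of a lowercase "p".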
_NUMBER_WORDS = {
"one", "two", "three", "four", "five", "six", "seven",
"eight", "nine", "ten", "eleven", "twelve", "thirteen",
"fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
"nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety", "hundred", "thousand", "and",
"einhundert", "zweihundert", "dreihundert", "vierhundert",
"und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
}
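    # Footers sometimes spell the page number out ("one hundred and twenty");
    # these English and German number words let such footers be recognised.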
for z in zones_data:
if z.get("zone_type") != "content":
continue
cells = z.get("cells", [])
rows = z.get("rows", [])
if not rows:
continue
# Extract column_1 cells that look like page references
page_refs = []
page_ref_cell_ids = set()
for cell in cells:
if cell.get("col_type") != "column_1":
continue
text = (cell.get("text") or "").strip()
if not text:
continue
if not _PAGE_REF_RE.match(text):
continue
page_refs.append({
"row_index": cell.get("row_index"),
"text": text,
"bbox_pct": cell.get("bbox_pct", {}),
})
page_ref_cell_ids.add(cell.get("cell_id"))
# Detect footer: last non-header row if it has only 1 cell
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
last_row = non_header_rows[-1]
last_ri = last_row["index"]
last_cells = [c for c in z["cells"]
if c.get("row_index") == last_ri]
if len(last_cells) == 1:
text = (last_cells[0].get("text") or "").strip()
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
has_commas = ',' in text
text_words = set(text.lower().split())
is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
is_page_number = len(text) <= 20 or is_written_number
if (text and not has_real_ipa and not has_commas
and is_page_number
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,
"bbox_pct": last_cells[0].get("bbox_pct", {}),
})
# Classify footer rows
page_number_footers = []
other_footers = []
for fr in footer_rows:
ft = fr["text"].strip()
digits = "".join(c for c in ft if c.isdigit())
if digits and re.match(r'^[\d\s.]+$', ft):
page_number_footers.append(fr)
elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
page_number_footers.append(fr)
else:
other_footers.append(fr)
# Remove page-number footer rows from grid entirely
if page_number_footers:
pn_ris = {fr["row_index"] for fr in page_number_footers}
z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
pn_text = page_number_footers[0]["text"].strip()
pn_digits = "".join(c for c in pn_text if c.isdigit())
if not page_number_info:
page_number_info = {
"text": pn_text,
"y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
}
if pn_digits:
page_number_info["number"] = int(pn_digits)
# Mark remaining footer rows
if other_footers:
footer_ris = {fr["row_index"] for fr in other_footers}
for r in z["rows"]:
if r["index"] in footer_ris:
r["is_footer"] = True
for c in z["cells"]:
if c.get("row_index") in footer_ris:
c["col_type"] = "footer"
if page_refs or footer_rows:
logger.info(
"Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
len(page_refs), len(footer_rows), len(page_number_footers),
z.get("zone_index", 0),
)
        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers
    return page_number_info
def _convert_slash_ipa(
zones_data: List[Dict[str, Any]],
skip_ipa: bool,
en_col_type: Optional[str],
) -> None:
"""Convert slash-delimited IPA to bracket notation.
Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
"""
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'  # headword (capture group 1)
        r"(/[^/]{2,}/)"  # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    # Trailing /ipa/ after an existing [ipa] bracket (hoisted out of the
    # per-cell loop so it is compiled once).
    _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
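    # Whitespace, parentheses, or commas between the slashes mean the span is
    # ordinary prose rather than an IPA transcription, so it is left alone.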
slash_ipa_fixed = 0
for z in ([] if skip_ipa else zones_data):
for cell in z.get("cells", []):
if en_col_type and cell.get("col_type") != en_col_type:
continue
text = cell.get("text", "")
if "/" not in text:
continue
def _replace_slash_ipa(m: re.Match) -> str:
nonlocal slash_ipa_fixed
headword = m.group(1)
ocr_ipa = m.group(2)
inner_raw = ocr_ipa.strip("/").strip()
if _SLASH_IPA_REJECT_RE.search(inner_raw):
return m.group(0)
clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
if ipa:
slash_ipa_fixed += 1
return f"{headword} [{ipa}]"
inner = inner_raw.lstrip("'").strip()
if inner:
slash_ipa_fixed += 1
return f"{headword} [{inner}]"
return m.group(0)
            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)
def _replace_trailing_slash(m: re.Match) -> str:
nonlocal slash_ipa_fixed
inner = m.group(1).strip("/").strip().lstrip("'").strip()
if _SLASH_IPA_REJECT_RE.search(inner):
return m.group(0)
if inner:
slash_ipa_fixed += 1
return f" [{inner}]"
return m.group(0)
new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)
if new_text == text:
m = _STANDALONE_SLASH_IPA_RE.match(text)
if m:
inner = m.group(1).strip()
if not _SLASH_IPA_REJECT_RE.search(inner):
inner = inner.lstrip("'").strip()
if inner:
new_text = "[" + inner + "]" + text[m.end():]
slash_ipa_fixed += 1
if new_text != text:
cell["text"] = new_text
if slash_ipa_fixed:
logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)


@@ -0,0 +1,462 @@
"""
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
detection and zone-aware grid building.
Extracted from grid_build_core.py for maintainability.
"""
import logging
from typing import Any, Dict, List, Optional
import cv2
import numpy as np
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_color_detect import recover_colored_text
from cv_vocab_types import PageZone
from ocr_pipeline_session_store import get_session_image
from grid_editor_helpers import (
_filter_border_strip_words,
_filter_border_ghosts,
_words_in_zone,
_PIPE_RE_VSPLIT,
_detect_vertical_dividers,
_split_zone_at_vertical_dividers,
_merge_content_zones_across_boxes,
_build_zone_grid,
)
logger = logging.getLogger(__name__)
async def _build_zones(
session_id: str,
session: dict,
all_words: List[Dict[str, Any]],
graphic_rects: List[Dict[str, int]],
content_x: int,
content_y: int,
content_w: int,
content_h: int,
img_w: int,
img_h: int,
) -> Dict[str, Any]:
"""Load image, detect graphics/boxes, build zone-aware grids.
Returns a dict with keys:
zones_data, boxes_detected, recovered_count, border_prefiltered,
img_bgr, all_words (modified in-place but returned for clarity).
"""
zones_data: List[Dict[str, Any]] = []
boxes_detected = 0
recovered_count = 0
border_prefiltered = False
img_bgr = None
# 3. Load image for box detection
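    # Prefer the most-processed variant available: cropped > dewarped > original.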
img_png = await get_session_image(session_id, "cropped")
if not img_png:
img_png = await get_session_image(session_id, "dewarped")
if not img_png:
img_png = await get_session_image(session_id, "original")
if img_png:
# Decode image for color detection + box detection
arr = np.frombuffer(img_png, dtype=np.uint8)
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is not None:
# --- 3a. Detect graphic/image regions via CV and hard-filter ---
sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
if fresh_graphics:
fresh_rects = [
{"x": g.x, "y": g.y, "w": g.width, "h": g.height}
for g in fresh_graphics
]
graphic_rects.extend(fresh_rects)
logger.info(
"build-grid session %s: detected %d graphic region(s) via CV",
session_id, len(fresh_graphics),
)
# Hard-filter words inside newly detected graphic regions
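                # (slice assignment keeps the same list object, so the
                # caller's all_words reference sees the filtering)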
before = len(all_words)
all_words[:] = [
w for w in all_words
if not any(
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
for gr in fresh_rects
)
]
removed = before - len(all_words)
if removed:
logger.info(
"build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
session_id, removed, len(fresh_rects),
)
# --- Recover colored text that OCR missed (before grid building) ---
recovered = recover_colored_text(img_bgr, all_words)
if recovered and graphic_rects:
# Filter recovered chars inside graphic regions
recovered = [
r for r in recovered
if not any(
gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
for gr in graphic_rects
)
]
if recovered:
recovered_count = len(recovered)
all_words.extend(recovered)
logger.info(
"build-grid session %s: +%d recovered colored words",
session_id, recovered_count,
)
# Detect bordered boxes
boxes = detect_boxes(
img_bgr,
content_x=content_x,
content_w=content_w,
content_y=content_y,
content_h=content_h,
)
boxes_detected = len(boxes)
if boxes:
# Filter border ghost words before grid building
all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
if ghost_count:
all_words[:] = all_words_new
logger.info(
"build-grid session %s: removed %d border ghost words",
session_id, ghost_count,
)
# Split page into zones
page_zones = split_page_into_zones(
content_x, content_y, content_w, content_h, boxes
)
# Merge content zones separated by box zones
page_zones = _merge_content_zones_across_boxes(
page_zones, content_x, content_w
)
# 3b. Detect vertical dividers and split content zones
page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
page_zones, all_words
)
# --- First pass: build grids per zone independently ---
zone_grids = _build_grids_per_zone(
page_zones, all_words, img_w, img_h
)
            border_prefiltered = border_prefiltered or border_prefiltered_vd or any(
                zg.get("_border_prefiltered") for zg in zone_grids
            )
# --- Second pass: merge column boundaries from all content zones ---
_merge_content_zone_columns(
zone_grids, all_words, content_w, img_w, img_h, session_id
)
# --- Build zones_data from zone_grids ---
for zg in zone_grids:
pz = zg["pz"]
grid = zg["grid"]
grid.pop("_raw_columns", None)
zone_entry: Dict[str, Any] = {
"zone_index": pz.index,
"zone_type": pz.zone_type,
"bbox_px": {
"x": pz.x, "y": pz.y,
"w": pz.width, "h": pz.height,
},
"bbox_pct": {
"x": round(pz.x / img_w * 100, 2) if img_w else 0,
"y": round(pz.y / img_h * 100, 2) if img_h else 0,
"w": round(pz.width / img_w * 100, 2) if img_w else 0,
"h": round(pz.height / img_h * 100, 2) if img_h else 0,
},
"border": None,
"word_count": len(zg["words"]),
**grid,
}
if pz.box:
zone_entry["border"] = {
"thickness": pz.box.border_thickness,
"confidence": pz.box.confidence,
}
if pz.image_overlays:
zone_entry["image_overlays"] = pz.image_overlays
if pz.layout_hint:
zone_entry["layout_hint"] = pz.layout_hint
if pz.vsplit_group is not None:
zone_entry["vsplit_group"] = pz.vsplit_group
zones_data.append(zone_entry)
# 4. Fallback: no boxes detected -> single zone with all words
if not zones_data:
before = len(all_words)
filtered_words = [
w for w in all_words
if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
]
removed = before - len(filtered_words)
if removed:
logger.info(
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed,
)
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid session %s: pre-filtered %d border-strip words",
session_id, bs_removed,
)
grid = _build_zone_grid(
filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h,
)
grid.pop("_raw_columns", None)
zones_data.append({
"zone_index": 0,
"zone_type": "content",
"bbox_px": {
"x": content_x, "y": content_y,
"w": content_w, "h": content_h,
},
"bbox_pct": {
"x": round(content_x / img_w * 100, 2) if img_w else 0,
"y": round(content_y / img_h * 100, 2) if img_h else 0,
"w": round(content_w / img_w * 100, 2) if img_w else 0,
"h": round(content_h / img_h * 100, 2) if img_h else 0,
},
"border": None,
"word_count": len(all_words),
**grid,
})
return {
"zones_data": zones_data,
"boxes_detected": boxes_detected,
"recovered_count": recovered_count,
"border_prefiltered": border_prefiltered,
"img_bgr": img_bgr,
}
def _detect_and_split_vertical_dividers(
page_zones: List[PageZone],
all_words: List[Dict[str, Any]],
) -> tuple:
"""Detect vertical dividers and split content zones.
Returns (expanded_zones, border_prefiltered_from_vsplit).
"""
vsplit_group_counter = 0
expanded_zones: List = []
for pz in page_zones:
if pz.zone_type != "content":
expanded_zones.append(pz)
continue
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
divider_xs = _detect_vertical_dividers(
zone_words, pz.x, pz.width, pz.y, pz.height
)
if divider_xs:
sub_zones = _split_zone_at_vertical_dividers(
pz, divider_xs, vsplit_group_counter
)
expanded_zones.extend(sub_zones)
vsplit_group_counter += 1
# Remove pipe words so they don't appear in sub-zones
pipe_ids = set(
id(w) for w in zone_words
if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
)
all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
logger.info(
"build-grid: vertical split zone %d at x=%s -> %d sub-zones",
pz.index, [int(x) for x in divider_xs], len(sub_zones),
)
else:
expanded_zones.append(pz)
# Re-index zones
for i, pz in enumerate(expanded_zones):
pz.index = i
return expanded_zones, False
def _build_grids_per_zone(
page_zones: List[PageZone],
all_words: List[Dict[str, Any]],
img_w: int,
img_h: int,
) -> List[Dict[str, Any]]:
"""Build grids for each zone independently (first pass)."""
zone_grids: List[Dict] = []
for pz in page_zones:
zone_words = _words_in_zone(
all_words, pz.y, pz.height, pz.x, pz.width
)
if pz.zone_type == "content":
logger.info(
"build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
pz.index, pz.zone_type,
pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
len(zone_words), len(all_words),
)
# Filter recovered single-char artifacts in ALL zones
before = len(zone_words)
zone_words = [
w for w in zone_words
if not (
w.get("recovered")
and len(w.get("text", "").strip()) <= 2
)
]
removed = before - len(zone_words)
if removed:
logger.info(
"build-grid: filtered %d recovered artifacts from %s zone %d",
removed, pz.zone_type, pz.index,
)
# Filter words inside image overlay regions (merged box zones)
if pz.image_overlays:
before_ov = len(zone_words)
zone_words = [
w for w in zone_words
if not any(
ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
for ov in pz.image_overlays
)
]
ov_removed = before_ov - len(zone_words)
if ov_removed:
logger.info(
"build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index,
)
zone_words, bs_removed = _filter_border_strip_words(zone_words)
bp = False
if bs_removed:
bp = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
skip_first_row_header=bool(pz.image_overlays),
)
zone_grids.append({
"pz": pz, "words": zone_words, "grid": grid,
"_border_prefiltered": bp,
})
return zone_grids
def _merge_content_zone_columns(
zone_grids: List[Dict[str, Any]],
all_words: List[Dict[str, Any]],
content_w: int,
img_w: int,
img_h: int,
session_id: str,
) -> None:
"""Second pass: merge column boundaries from all content zones.
Modifies zone_grids in place.
"""
content_zones = [
zg for zg in zone_grids
if zg["pz"].zone_type == "content"
and zg["pz"].vsplit_group is None
]
if len(content_zones) <= 1:
return
# Collect column split points (x_min of non-first columns)
all_split_xs: List[float] = []
for zg in content_zones:
raw_cols = zg["grid"].get("_raw_columns", [])
for col in raw_cols[1:]:
all_split_xs.append(col["x_min"])
if not all_split_xs:
return
all_split_xs.sort()
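    # Split points closer together than ~3% of the content width (at least
    # 25 px) are treated as the same column boundary and averaged.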
merge_distance = max(25, int(content_w * 0.03))
merged_xs = [all_split_xs[0]]
for x in all_split_xs[1:]:
if x - merged_xs[-1] < merge_distance:
merged_xs[-1] = (merged_xs[-1] + x) / 2
else:
merged_xs.append(x)
total_cols = len(merged_xs) + 1
max_zone_cols = max(
len(zg["grid"].get("_raw_columns", []))
for zg in content_zones
)
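    # Only apply the merged layout if it preserves at least as many columns
    # as the richest individual zone detected on its own.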
if total_cols < max_zone_cols:
return
cx_min = min(w["left"] for w in all_words)
cx_max = max(w["left"] + w["width"] for w in all_words)
merged_columns: List[Dict[str, Any]] = []
prev_x = cx_min
for i, sx in enumerate(merged_xs):
merged_columns.append({
"index": i,
"type": f"column_{i + 1}",
"x_min": prev_x,
"x_max": sx,
})
prev_x = sx
merged_columns.append({
"index": len(merged_xs),
"type": f"column_{len(merged_xs) + 1}",
"x_min": prev_x,
"x_max": cx_max,
})
# Re-build ALL content zones with merged columns
for zg in zone_grids:
pz = zg["pz"]
if pz.zone_type == "content":
grid = _build_zone_grid(
zg["words"], pz.x, pz.y,
pz.width, pz.height,
pz.index, img_w, img_h,
global_columns=merged_columns,
skip_first_row_header=bool(pz.image_overlays),
)
zg["grid"] = grid
logger.info(
"build-grid session %s: union of %d content "
"zones -> %d merged columns (max single zone: %d)",
session_id, len(content_zones),
total_cols, max_zone_cols,
)


@@ -0,0 +1,472 @@
"""
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
extract-with-boxes, deskewed images, and learning unit generation.
The two large handlers (compare_ocr_methods, analyze_grid) live in
vocab_worksheet_compare_api.py and are included via compare_router.
"""
from fastapi import APIRouter, Body, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, Dict, Any
from datetime import datetime
import os
import io
import json
import logging
def _get_sessions():
from vocab_worksheet_api import _sessions
return _sessions
def _get_local_storage_path():
from vocab_worksheet_api import LOCAL_STORAGE_PATH
return LOCAL_STORAGE_PATH
from vocab_worksheet_generation import convert_pdf_page_to_image
# Try to import Tesseract extractor
try:
from tesseract_vocab_extractor import (
extract_bounding_boxes, TESSERACT_AVAILABLE,
)
except ImportError:
TESSERACT_AVAILABLE = False
# Try to import Grid Detection Service
try:
from services.grid_detection_service import GridDetectionService
GRID_SERVICE_AVAILABLE = True
except ImportError:
GRID_SERVICE_AVAILABLE = False
logger = logging.getLogger(__name__)
analysis_router = APIRouter()
def _ocr_export_dir():
return os.path.join(_get_local_storage_path(), "ocr-exports")
def _ground_truth_dir():
return os.path.join(_get_local_storage_path(), "ground-truth")
# =============================================================================
# OCR Export Endpoints (for cross-app OCR data sharing)
# =============================================================================
@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
"""
Save OCR export data for cross-app sharing (admin-v2 -> studio-v2).
Both apps proxy to klausur-service via /klausur-api/, so this endpoint
serves as shared storage accessible from both ports.
"""
logger.info(f"Saving OCR export for session {session_id}, page {page_number}")
os.makedirs(_ocr_export_dir(), exist_ok=True)
# Save the export data
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
with open(export_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
# Update latest pointer
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
with open(latest_path, 'w', encoding='utf-8') as f:
json.dump({
"session_id": session_id,
"page_number": page_number,
"saved_at": datetime.utcnow().isoformat(),
}, f, ensure_ascii=False, indent=2)
return {
"success": True,
"session_id": session_id,
"page_number": page_number,
"message": "OCR export saved successfully",
}
@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
"""Load a specific OCR export by session and page number."""
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(export_path):
raise HTTPException(status_code=404, detail="OCR export not found")
with open(export_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
"""Load the most recently saved OCR export data."""
latest_path = os.path.join(_ocr_export_dir(), "latest.json")
if not os.path.exists(latest_path):
raise HTTPException(status_code=404, detail="No OCR exports found")
with open(latest_path, 'r', encoding='utf-8') as f:
pointer = json.load(f)
session_id = pointer.get("session_id")
page_number = pointer.get("page_number")
export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(export_path):
raise HTTPException(status_code=404, detail="Latest OCR export file not found")
with open(export_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
# =============================================================================
# Extract with Boxes & Deskewed Image
# =============================================================================
async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
"""Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.
Returns dict with 'entries' list and 'image_width'/'image_height'.
Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex.
All bbox coordinates are in percent (0-100).
"""
if not TESSERACT_AVAILABLE:
raise HTTPException(status_code=500, detail="Tesseract not available")
if not GRID_SERVICE_AVAILABLE:
raise HTTPException(status_code=500, detail="GridDetectionService not available")
# Step 1: Tesseract word-level bounding boxes
tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
words = tess_result.get("words", [])
img_w = tess_result.get("image_width", 0)
img_h = tess_result.get("image_height", 0)
if not words or img_w == 0 or img_h == 0:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 2: Convert to OCR regions (percentage-based)
service = GridDetectionService()
regions = service.convert_tesseract_regions(words, img_w, img_h)
if not regions:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 3: Detect grid
grid_result = service.detect_grid(regions)
if not grid_result.cells:
return {"entries": [], "image_width": img_w, "image_height": img_h}
# Step 4: Group cells by logical_row and column_type
from services.grid_detection_service import ColumnType
entries = []
for row_idx, row_cells in enumerate(grid_result.cells):
en_text = ""
de_text = ""
ex_text = ""
en_bbox = None
de_bbox = None
ex_bbox = None
row_conf_sum = 0.0
row_conf_count = 0
for cell in row_cells:
cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
"w": round(cell.width, 2), "h": round(cell.height, 2)}
if cell.column_type == ColumnType.ENGLISH:
en_text = cell.text.strip()
en_bbox = cell_bbox
elif cell.column_type == ColumnType.GERMAN:
de_text = cell.text.strip()
de_bbox = cell_bbox
elif cell.column_type == ColumnType.EXAMPLE:
ex_text = cell.text.strip()
ex_bbox = cell_bbox
if cell.text.strip():
row_conf_sum += cell.confidence
row_conf_count += 1
# Skip completely empty rows
if not en_text and not de_text and not ex_text:
continue
# Calculate whole-row bounding box
all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None]
if all_bboxes:
row_x = min(b["x"] for b in all_bboxes)
row_y = min(b["y"] for b in all_bboxes)
row_right = max(b["x"] + b["w"] for b in all_bboxes)
row_bottom = max(b["y"] + b["h"] for b in all_bboxes)
row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2),
"w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)}
else:
row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}
avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1)
entries.append({
"row_index": row_idx,
"english": en_text,
"german": de_text,
"example": ex_text,
"confidence": avg_conf,
"bbox": row_bbox,
"bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
"bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
"bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
})
return {"entries": entries, "image_width": img_w, "image_height": img_h}
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
"""Extract vocabulary entries with bounding boxes for ground truth labeling.
Uses Tesseract + GridDetectionService for spatial positioning.
page_number is 0-indexed.
"""
logger.info(f"Extract with boxes for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Convert page to hires image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
# Deskew image before OCR
deskew_angle = 0.0
try:
from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
if CV2_AVAILABLE:
image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
except Exception as e:
logger.warning(f"Deskew failed for page {page_number}: {e}")
# Cache deskewed image in session for later serving
if "deskewed_images" not in session:
session["deskewed_images"] = {}
session["deskewed_images"][str(page_number)] = image_data
# Extract entries with boxes (now on deskewed image)
result = await extract_entries_with_boxes(image_data)
# Cache in session
if "gt_entries" not in session:
session["gt_entries"] = {}
session["gt_entries"][str(page_number)] = result["entries"]
return {
"success": True,
"entries": result["entries"],
"entry_count": len(result["entries"]),
"image_width": result["image_width"],
"image_height": result["image_height"],
"deskew_angle": round(deskew_angle, 2),
"deskewed": abs(deskew_angle) > 0.05,
}
@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
"""Return the deskewed page image as PNG.
Falls back to the original hires image if no deskewed version is cached.
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
deskewed = session.get("deskewed_images", {}).get(str(page_number))
if deskewed:
return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")
# Fallback: render original hires image
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
return StreamingResponse(io.BytesIO(image_data), media_type="image/png")
# =============================================================================
# Ground Truth Labeling
# =============================================================================
@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
"""Save ground truth labels for a page.
Expects body with 'entries' list - each entry has english, german, example,
status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
"""
logger.info(f"Save ground truth for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
entries = data.get("entries", [])
if not entries:
raise HTTPException(status_code=400, detail="No entries provided")
# Save in session
session = _get_sessions()[session_id]
if "ground_truth" not in session:
session["ground_truth"] = {}
session["ground_truth"][str(page_number)] = entries
# Also save to disk
os.makedirs(_ground_truth_dir(), exist_ok=True)
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
gt_data = {
"session_id": session_id,
"page_number": page_number,
"saved_at": datetime.now().isoformat(),
"entry_count": len(entries),
"entries": entries,
}
with open(gt_path, 'w', encoding='utf-8') as f:
json.dump(gt_data, f, ensure_ascii=False, indent=2)
logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")
confirmed = sum(1 for e in entries if e.get("status") == "confirmed")
edited = sum(1 for e in entries if e.get("status") == "edited")
skipped = sum(1 for e in entries if e.get("status") == "skipped")
return {
"success": True,
"saved_count": len(entries),
"confirmed": confirmed,
"edited": edited,
"skipped": skipped,
"file_path": gt_path,
}
@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
"""Load saved ground truth for a page."""
logger.info(f"Load ground truth for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
# Try session cache first
session = _get_sessions()[session_id]
cached = session.get("ground_truth", {}).get(str(page_number))
if cached:
return {"success": True, "entries": cached, "source": "cache"}
# Try disk
gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
if not os.path.exists(gt_path):
raise HTTPException(status_code=404, detail="No ground truth found for this page")
with open(gt_path, 'r', encoding='utf-8') as f:
gt_data = json.load(f)
return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"}
# =============================================================================
# Learning Module Generation
# =============================================================================
class GenerateLearningUnitRequest(BaseModel):
grade: Optional[str] = None
generate_modules: bool = True
@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: Optional[GenerateLearningUnitRequest] = None):
"""
Create a Learning Unit from the vocabulary in this session.
1. Takes vocabulary from the session
2. Creates a Learning Unit in backend-lehrer
3. Optionally triggers MC/Cloze/QA generation
Returns the created unit info and generation status.
"""
if request is None:
request = GenerateLearningUnitRequest()
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
vocabulary = session.get("vocabulary", [])
if not vocabulary:
raise HTTPException(status_code=400, detail="No vocabulary in this session")
try:
from vocab_learn_bridge import create_learning_unit, generate_learning_modules
# Step 1: Create Learning Unit
result = await create_learning_unit(
session_name=session["name"],
vocabulary=vocabulary,
grade=request.grade,
)
# Step 2: Generate modules if requested
if request.generate_modules:
try:
gen_result = await generate_learning_modules(
unit_id=result["unit_id"],
analysis_path=result["analysis_path"],
)
result["generation"] = gen_result
except Exception as e:
logger.warning(f"Module generation failed (unit created): {e}")
result["generation"] = {"status": "error", "reason": str(e)}
return result
except ImportError:
raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available")
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except RuntimeError as e:
raise HTTPException(status_code=502, detail=str(e))
# =============================================================================
# Include compare_ocr_methods & analyze_grid from companion module
# =============================================================================
from vocab_worksheet_compare_api import compare_router # noqa: E402
analysis_router.include_router(compare_router)

File diff suppressed because it is too large

@@ -0,0 +1,545 @@
"""
Vocabulary Worksheet Compare & Grid Analysis API.
Split from vocab_worksheet_analysis_api.py — contains the two largest
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
"""
from fastapi import APIRouter, HTTPException, Query
import base64
import json
import logging
import os
from vocab_worksheet_extraction import extract_vocabulary_from_image
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")
def _get_sessions():
from vocab_worksheet_api import _sessions
return _sessions
from vocab_worksheet_generation import convert_pdf_page_to_image
# Try to import Tesseract extractor
try:
from tesseract_vocab_extractor import (
run_tesseract_pipeline,
match_positions_to_vocab, TESSERACT_AVAILABLE,
)
except ImportError:
TESSERACT_AVAILABLE = False
# Try to import CV Pipeline
try:
from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
CV_PIPELINE_AVAILABLE = False
# Try to import Grid Detection Service
try:
from services.grid_detection_service import GridDetectionService
GRID_SERVICE_AVAILABLE = True
except ImportError:
GRID_SERVICE_AVAILABLE = False
logger = logging.getLogger(__name__)
compare_router = APIRouter()
# =============================================================================
# OCR Compare & Grid Analysis Endpoints
# =============================================================================
@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
async def compare_ocr_methods(session_id: str, page_number: int):
"""
Run multiple OCR methods on a page and compare results.
This endpoint:
1. Gets the page image from the session's uploaded PDF
2. Runs Vision LLM extraction (primary method)
3. Optionally runs Tesseract extraction
4. Compares found vocabulary across methods
5. Returns structured comparison results
page_number is 0-indexed.
"""
    import time
logger.info(f"Compare OCR for session {session_id}, page {page_number}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Convert page to image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
methods_results = {}
all_vocab_sets = {}
# --- Method: Vision LLM ---
try:
start = time.time()
vocab, confidence, error = await extract_vocabulary_from_image(
image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
)
duration = time.time() - start
vocab_list = []
for v in vocab:
entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
vocab_list.append({
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example": entry.get("example_sentence", ""),
})
methods_results["vision_llm"] = {
"name": "Vision LLM",
"model": VISION_MODEL,
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list),
"vocabulary": vocab_list,
"confidence": confidence,
"success": len(vocab_list) > 0 and not error,
"error": error if error else None,
}
all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
except Exception as e:
logger.error(f"Vision LLM failed: {e}")
methods_results["vision_llm"] = {
"name": "Vision LLM",
"model": VISION_MODEL,
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["vision_llm"] = set()
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
if TESSERACT_AVAILABLE:
try:
start = time.time()
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
duration = time.time() - start
tess_vocab = tess_result.get("vocabulary", [])
tess_words = tess_result.get("words", [])
# Store Tesseract words in session for later use (grid analysis, position matching)
session["tesseract_words"] = tess_words
session["tesseract_image_width"] = tess_result.get("image_width", 0)
session["tesseract_image_height"] = tess_result.get("image_height", 0)
session[f"tesseract_page_{page_number}"] = tess_result
vocab_list_tess = []
for v in tess_vocab:
vocab_list_tess.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr (eng+deu)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_tess),
"vocabulary": vocab_list_tess,
"confidence": 0.7 if tess_vocab else 0,
"success": len(vocab_list_tess) > 0,
"error": tess_result.get("error"),
"word_count": tess_result.get("word_count", 0),
"columns_detected": len(tess_result.get("columns", [])),
}
all_vocab_sets["tesseract"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_tess if v["english"] and v["german"]
}
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
llm_vocab_with_bbox = match_positions_to_vocab(
tess_words,
methods_results["vision_llm"]["vocabulary"],
tess_result.get("image_width", 1),
tess_result.get("image_height", 1),
)
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
except Exception as e:
logger.error(f"Tesseract failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["tesseract"] = {
"name": "Tesseract OCR",
"model": "tesseract-ocr",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["tesseract"] = set()
# --- Method: CV Pipeline (Document Reconstruction) ---
if CV_PIPELINE_AVAILABLE:
try:
start = time.time()
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
duration = time.time() - start
cv_vocab = cv_result.vocabulary if not cv_result.error else []
vocab_list_cv = []
for v in cv_vocab:
vocab_list_cv.append({
"english": v.get("english", ""),
"german": v.get("german", ""),
"example": v.get("example", ""),
})
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": round(duration, 1),
"vocabulary_count": len(vocab_list_cv),
"vocabulary": vocab_list_cv,
"confidence": 0.8 if cv_vocab else 0,
"success": len(vocab_list_cv) > 0,
"error": cv_result.error,
"word_count": cv_result.word_count,
"columns_detected": cv_result.columns_detected,
"stages": cv_result.stages,
}
all_vocab_sets["cv_pipeline"] = {
(v["english"].lower().strip(), v["german"].lower().strip())
for v in vocab_list_cv if v["english"] and v["german"]
}
except Exception as e:
logger.error(f"CV Pipeline failed: {e}")
import traceback
logger.debug(traceback.format_exc())
methods_results["cv_pipeline"] = {
"name": "CV Pipeline (Document Reconstruction)",
"model": "opencv + tesseract (multi-pass)",
"duration_seconds": 0,
"vocabulary_count": 0,
"vocabulary": [],
"confidence": 0,
"success": False,
"error": str(e),
}
all_vocab_sets["cv_pipeline"] = set()
# --- Build comparison ---
all_unique = set()
for vs in all_vocab_sets.values():
all_unique |= vs
found_by_all = []
found_by_some = []
for english, german in sorted(all_unique):
found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
entry = {"english": english, "german": german, "methods": found_in}
if len(found_in) == len(all_vocab_sets):
found_by_all.append(entry)
else:
found_by_some.append(entry)
    agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
# Find best method
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
return {
"session_id": session_id,
"page_number": page_number,
"methods": methods_results,
"comparison": {
"found_by_all_methods": found_by_all,
"found_by_some_methods": found_by_some,
"total_unique_vocabulary": len(all_unique),
"agreement_rate": agreement_rate,
},
"recommendation": {
"best_method": best_method,
"reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
},
}
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
"""
Analyze the grid/table structure of a vocabulary page.
Hybrid approach:
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
real spatial positions via GridDetectionService.
2. Otherwise fall back to Vision LLM for grid structure detection.
page_number is 0-indexed.
Returns GridData structure expected by the frontend GridOverlay component.
"""
import httpx
import time
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number.")
# Convert page to image
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
tess_page_data = session.get(f"tesseract_page_{page_number}")
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
try:
# Run Tesseract if not already cached
if not tess_page_data:
logger.info("Running Tesseract for grid analysis (not cached)")
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
tess_page_data = await _run_tess(image_data, lang="eng+deu")
session[f"tesseract_page_{page_number}"] = tess_page_data
session["tesseract_words"] = tess_page_data.get("words", [])
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
tess_words = tess_page_data.get("words", [])
img_w = tess_page_data.get("image_width", 0)
img_h = tess_page_data.get("image_height", 0)
if tess_words and img_w > 0 and img_h > 0:
service = GridDetectionService()
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
if regions:
grid_result = service.detect_grid(regions)
grid_dict = grid_result.to_dict()
                    # NOTE: merging in the LLM text stored during compare-ocr
                    # (often better quality than Tesseract) is not done here yet.
                    grid_dict["source"] = "tesseract+grid_service"
grid_dict["word_count"] = len(tess_words)
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
f"{grid_result.stats.get('recognized', 0)} recognized")
return {"success": True, "grid": grid_dict}
logger.info("Tesseract data insufficient, falling back to LLM")
except Exception as e:
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
import traceback
logger.debug(traceback.format_exc())
# --- Strategy 2: Fall back to Vision LLM ---
image_base64 = base64.b64encode(image_data).decode("utf-8")
grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.
Your task: Identify the TABLE STRUCTURE and extract each cell's content.
Return a JSON object with this EXACT structure:
{
"rows": <number of rows>,
"columns": <number of columns>,
"column_types": ["english", "german", "example"],
"entries": [
{
"row": 0,
"col": 0,
"text": "the word or phrase in this cell",
"column_type": "english",
"confidence": 0.95
}
]
}
Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""
try:
import asyncio
raw_text = ""
max_retries = 3
for attempt in range(max_retries):
async with httpx.AsyncClient(timeout=300.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/chat",
json={
"model": VISION_MODEL,
"messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 8192},
},
timeout=300.0,
)
if response.status_code == 500 and attempt < max_retries - 1:
wait_time = 10 * (attempt + 1)
logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
await asyncio.sleep(wait_time)
continue
elif response.status_code != 200:
error_detail = response.text[:200] if response.text else "Unknown error"
return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}
raw_text = response.json().get("message", {}).get("content", "")
break
# Parse JSON from response
import re
json_match = re.search(r'\{[\s\S]*\}', raw_text)
if not json_match:
return {"success": False, "error": "Could not parse grid structure from LLM response"}
grid_raw = json.loads(json_match.group())
num_rows = grid_raw.get("rows", 0)
num_cols = grid_raw.get("columns", 0)
column_types = grid_raw.get("column_types", [])
entries = grid_raw.get("entries", [])
if num_rows == 0 or num_cols == 0:
return {"success": False, "error": "No grid structure detected"}
# Ensure column_types has the right length
while len(column_types) < num_cols:
column_types.append("unknown")
# Build cell grid with percentage-based coordinates
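        # (the LLM reports no pixel coordinates, so cells are laid out on a
        # uniform synthetic grid)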
row_height = 100.0 / num_rows
col_width = 100.0 / num_cols
# Track which cells have content
cell_map = {}
for entry in entries:
r = entry.get("row", 0)
c = entry.get("col", 0)
cell_map[(r, c)] = entry
cells = []
recognized_count = 0
empty_count = 0
problematic_count = 0
for r in range(num_rows):
row_cells = []
for c in range(num_cols):
x = c * col_width
y = r * row_height
if (r, c) in cell_map:
entry = cell_map[(r, c)]
text = entry.get("text", "").strip()
conf = entry.get("confidence", 0.8)
col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")
if text:
status = "recognized" if conf >= 0.5 else "problematic"
if status == "recognized":
recognized_count += 1
else:
problematic_count += 1
else:
status = "empty"
empty_count += 1
else:
text = ""
conf = 0.0
col_type = column_types[c] if c < len(column_types) else "unknown"
status = "empty"
empty_count += 1
row_cells.append({
"row": r,
"col": c,
"x": round(x, 2),
"y": round(y, 2),
"width": round(col_width, 2),
"height": round(row_height, 2),
"text": text,
"confidence": conf,
"status": status,
"column_type": col_type,
})
cells.append(row_cells)
total = num_rows * num_cols
coverage = (recognized_count + problematic_count) / max(total, 1)
# Column and row boundaries as percentages
col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]
grid_data = {
"rows": num_rows,
"columns": num_cols,
"cells": cells,
"column_types": column_types,
"column_boundaries": col_boundaries,
"row_boundaries": row_boundaries,
"deskew_angle": 0.0,
"source": "vision_llm",
"stats": {
"recognized": recognized_count,
"problematic": problematic_count,
"empty": empty_count,
"manual": 0,
"total": total,
"coverage": round(coverage, 3),
},
}
return {"success": True, "grid": grid_data}
except httpx.TimeoutException:
logger.error("Grid analysis timed out")
return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
except Exception as e:
logger.error(f"Grid analysis failed: {e}")
import traceback
logger.debug(traceback.format_exc())
return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}


@@ -0,0 +1,325 @@
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.
Contains:
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): Demo data for testing
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
"""
import base64
import json
import logging
import os
import re
import uuid
from typing import List
import httpx
from vocab_worksheet_models import VocabularyEntry
logger = logging.getLogger(__name__)
# Ollama Configuration
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")
# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================
VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.
AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:
{
"vocabulary": [
{
"english": "to improve",
"german": "verbessern",
"example": "I want to improve my English."
}
]
}
REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"
Beispiel-Output:
{
"vocabulary": [
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
]
}"""
async def extract_vocabulary_from_image(
image_data: bytes,
filename: str,
page_number: int = 0,
use_hybrid: bool = False # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
"""
Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).
Args:
image_data: Image bytes
filename: Original filename for logging
page_number: 0-indexed page number for error messages
use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text)
If False, use Vision LLM (slower, better for complex layouts)
Returns:
Tuple of (vocabulary_entries, confidence, error_message)
error_message is empty string on success
"""
# ==========================================================================
# HYBRID APPROACH (Default): PaddleOCR + LLM Gateway
# ==========================================================================
if use_hybrid:
try:
from hybrid_vocab_extractor import extract_vocabulary_hybrid
logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")
vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)
if error:
logger.warning(f"Hybrid extraction had issues: {error}")
# Fall through to Vision LLM fallback
elif vocab_dicts:
# Convert dicts to VocabularyEntry objects
vocabulary = [
VocabularyEntry(
id=str(uuid.uuid4()),
english=v.get("english", ""),
german=v.get("german", ""),
example_sentence=v.get("example"),
source_page=page_number + 1
)
for v in vocab_dicts
if v.get("english") and v.get("german")
]
logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
return vocabulary, confidence, ""
except ImportError as e:
logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
except Exception as e:
logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
import traceback
logger.debug(traceback.format_exc())
# ==========================================================================
    # FALLBACK: Vision LLM (Ollama, model from OLLAMA_VISION_MODEL; default qwen2.5vl:32b)
# ==========================================================================
logger.info(f"Using VISION LLM extraction for {filename}")
try:
# First check if Ollama is available
async with httpx.AsyncClient(timeout=10.0) as check_client:
try:
health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
if health_response.status_code != 200:
logger.error(f"Ollama not available at {OLLAMA_URL}")
return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
except Exception as e:
logger.error(f"Ollama health check failed: {e}")
return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"
image_base64 = base64.b64encode(image_data).decode("utf-8")
payload = {
"model": VISION_MODEL,
"messages": [
{
"role": "user",
"content": VOCAB_EXTRACTION_PROMPT,
"images": [image_base64]
}
],
"stream": False,
"options": {
"temperature": 0.1,
"num_predict": 4096,
}
}
logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")
        # Vision models can be slow; the per-request timeout below (300s) overrides the client default
async with httpx.AsyncClient(timeout=600.0) as client:
response = await client.post(
f"{OLLAMA_URL}/api/chat",
json=payload,
timeout=300.0 # 5 minutes per page
)
response.raise_for_status()
data = response.json()
extracted_text = data.get("message", {}).get("content", "")
logger.info(f"Ollama response received: {len(extracted_text)} chars")
# Parse JSON from response
vocabulary = parse_vocabulary_json(extracted_text)
# Set source_page for each entry
for v in vocabulary:
v.source_page = page_number + 1
# Estimate confidence
confidence = 0.85 if len(vocabulary) > 0 else 0.1
logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")
return vocabulary, confidence, ""
except httpx.TimeoutException:
logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
except Exception as e:
logger.error(f"Vocabulary extraction failed for {filename}: {e}")
import traceback
logger.error(traceback.format_exc())
return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"
def _get_demo_vocabulary() -> List[VocabularyEntry]:
"""Return demo vocabulary for testing when Vision LLM is not available."""
demo_entries = [
{"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
{"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
{"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
{"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
{"english": "success", "german": "Erfolg", "example": "The project was a success."},
{"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
{"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
{"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
]
return [
VocabularyEntry(
id=str(uuid.uuid4()),
english=e["english"],
german=e["german"],
example_sentence=e.get("example"),
)
for e in demo_entries
]
def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
"""Parse vocabulary JSON from LLM response with robust error handling."""
    def clean_json_string(s: str) -> str:
        """Clean a JSON string by removing control characters and fixing common issues."""
        # Remove control characters except newlines and tabs
        s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
        # Replace raw newlines/tabs with spaces: escaping them as \n would
        # also corrupt the structural whitespace between JSON tokens, while a
        # space is valid both inside string values and between tokens.
        s = s.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return s
def try_parse_json(json_str: str) -> dict:
"""Try multiple strategies to parse JSON."""
# Strategy 1: Direct parse
try:
return json.loads(json_str)
except json.JSONDecodeError:
pass
# Strategy 2: Clean and parse
try:
cleaned = clean_json_string(json_str)
return json.loads(cleaned)
except json.JSONDecodeError:
pass
# Strategy 3: Try to fix common issues
try:
# Remove trailing commas before } or ]
fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
# Fix unquoted keys
fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
return json.loads(fixed)
except json.JSONDecodeError:
pass
return None
try:
# Find JSON in response (may have extra text)
start = text.find('{')
end = text.rfind('}') + 1
if start == -1 or end == 0:
logger.warning("No JSON found in response")
return []
json_str = text[start:end]
data = try_parse_json(json_str)
if data is None:
# Strategy 4: Extract vocabulary entries using regex as fallback
logger.warning("JSON parsing failed, trying regex extraction")
vocabulary = []
# Match patterns like {"english": "...", "german": "...", ...}
pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
for match in matches:
english = match[0].strip() if match[0] else ""
german = match[1].strip() if match[1] else ""
example = match[2].strip() if len(match) > 2 and match[2] else None
if english and german:
vocab_entry = VocabularyEntry(
id=str(uuid.uuid4()),
english=english,
german=german,
example_sentence=example,
)
vocabulary.append(vocab_entry)
if vocabulary:
logger.info(f"Regex extraction found {len(vocabulary)} entries")
return vocabulary
# Normal JSON parsing succeeded
vocabulary = []
        for entry in data.get("vocabulary", []):
english = entry.get("english", "").strip()
german = entry.get("german", "").strip()
# Skip entries that look like hallucinations (very long or containing unusual patterns)
if len(english) > 100 or len(german) > 200:
logger.warning(f"Skipping suspicious entry: {english[:50]}...")
continue
if not english or not german:
continue
vocab_entry = VocabularyEntry(
id=str(uuid.uuid4()),
english=english,
german=german,
example_sentence=entry.get("example"),
word_type=entry.get("word_type"),
)
vocabulary.append(vocab_entry)
return vocabulary
except Exception as e:
logger.error(f"Failed to parse vocabulary JSON: {e}")
import traceback
logger.error(traceback.format_exc())
return []
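# ---------------------------------------------------------------------------
# Usage sketch (illustrative): parse_vocabulary_json tolerates messy LLM
# output such as leading prose, unquoted keys, and trailing commas, falling
# through the four strategies until one succeeds.
#
#   raw = ('Here is the result: '
#          '{"vocabulary": [{english: "to achieve", german: "erreichen",}]}')
#   entries = parse_vocabulary_json(raw)
#   assert entries[0].english == "to achieve"
# ---------------------------------------------------------------------------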


@@ -0,0 +1,260 @@
"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Functions:
- generate_worksheet_html(): Build HTML for various worksheet types
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
- convert_pdf_page_to_image(): Render single PDF page to PNG
- convert_pdf_to_images(): Render multiple PDF pages to PNG
"""
import io
import logging
import os
from typing import List, Optional
from fastapi import HTTPException
from vocab_worksheet_models import VocabularyEntry, WorksheetType
logger = logging.getLogger(__name__)
# Optional dependency: WeasyPrint
try:
from weasyprint import HTML as _WeasyHTML
WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
WEASYPRINT_AVAILABLE = False
logger.warning("WeasyPrint not available")
# Optional dependency: PyMuPDF
try:
import fitz # PyMuPDF
FITZ_AVAILABLE = True
except ImportError:
FITZ_AVAILABLE = False
logger.warning("PyMuPDF (fitz) not available")
# =============================================================================
# Worksheet HTML Generation
# =============================================================================
def generate_worksheet_html(
vocabulary: List[VocabularyEntry],
worksheet_type: WorksheetType,
title: str,
show_solutions: bool = False,
repetitions: int = 3,
line_height: str = "normal"
) -> str:
"""Generate HTML for a worksheet."""
# Line height CSS
line_heights = {
"normal": "2.5em",
"large": "3.5em",
"extra-large": "4.5em"
}
lh = line_heights.get(line_height, "2.5em")
html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
@page {{ size: A4; margin: 2cm; }}
body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
h1 {{ font-size: 24px; margin-bottom: 10px; }}
.meta {{ color: #666; margin-bottom: 20px; }}
.name-line {{ margin-bottom: 30px; }}
.vocab-table {{ width: 100%; border-collapse: collapse; }}
.vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
.vocab-word {{ width: 40%; font-weight: 500; }}
.vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
.vocab-answer {{ width: 60%; color: #2563eb; }}
.gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
.hint {{ color: #666; font-style: italic; font-size: 12px; }}
.section {{ margin-top: 30px; }}
.section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{title}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""
if worksheet_type == WorksheetType.EN_TO_DE:
html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
if show_solutions:
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-answer">{entry.german}</td></tr>'
else:
html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-blank"></td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.DE_TO_EN:
html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
if show_solutions:
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-answer">{entry.english}</td></tr>'
else:
html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-blank"></td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.COPY_PRACTICE:
html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
html += '<table class="vocab-table">'
for entry in vocabulary:
html += f'<tr><td class="vocab-word">{entry.english}</td>'
html += '<td class="vocab-blank">'
if show_solutions:
html += f' {entry.english} ' * repetitions
html += '</td></tr>'
html += '</table></div>'
elif worksheet_type == WorksheetType.GAP_FILL:
entries_with_examples = [e for e in vocabulary if e.example_sentence]
if entries_with_examples:
html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
for i, entry in enumerate(entries_with_examples, 1):
                # Blank out the first occurrence of a content word (skip the
                # infinitive marker "to"); replacing only the first hit avoids
                # turning every "to" in the sentence into a gap.
                gap_sentence = entry.example_sentence
                for word in entry.english.split():
                    if word.lower() == "to":
                        continue
                    idx = gap_sentence.lower().find(word.lower())
                    if idx != -1:
                        gap_sentence = (gap_sentence[:idx]
                                        + '<span class="gap"></span>'
                                        + gap_sentence[idx + len(word):])
                        break
html += f'<p>{i}. {gap_sentence}</p>'
if show_solutions:
html += f'<p class="hint">Loesung: {entry.english}</p>'
else:
html += f'<p class="hint">({entry.german})</p>'
html += '</div>'
html += '</body></html>'
return html
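# Minimal sketch (illustrative; `entries` is assumed to be a list of
# VocabularyEntry objects): a gap-fill sheet plus a teacher solution copy.
#
#   pupil_html = generate_worksheet_html(entries, WorksheetType.GAP_FILL, "Unit 3")
#   teacher_html = generate_worksheet_html(entries, WorksheetType.GAP_FILL,
#                                          "Unit 3", show_solutions=True)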
# =============================================================================
# Worksheet PDF Generation
# =============================================================================
async def generate_worksheet_pdf(html: str) -> bytes:
    """Generate PDF from HTML using WeasyPrint; falls back to raw HTML bytes."""
    if not WEASYPRINT_AVAILABLE:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    try:
        return _WeasyHTML(string=html).write_pdf()
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise
# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================
def get_pdf_page_count(pdf_data: bytes) -> int:
    """Get the number of pages in a PDF."""
    if not FITZ_AVAILABLE:
        logger.error("PyMuPDF (fitz) not installed")
        return 0
    try:
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        count = pdf_document.page_count
        pdf_document.close()
        return count
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0
async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
"""Convert a specific page of PDF to PNG image using PyMuPDF.
Args:
pdf_data: PDF file as bytes
page_number: 0-indexed page number
thumbnail: If True, return a smaller thumbnail image
"""
    if not FITZ_AVAILABLE:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    try:
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
if pdf_document.page_count == 0:
raise ValueError("PDF has no pages")
if page_number >= pdf_document.page_count:
raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")
page = pdf_document[page_number]
# Render page to image
# For thumbnails: lower resolution, for OCR: higher resolution
zoom = 0.5 if thumbnail else 2.0
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
return png_data
except Exception as e:
logger.error(f"PDF conversion failed: {e}")
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
"""Convert multiple pages of PDF to PNG images.
Args:
pdf_data: PDF file as bytes
pages: List of 0-indexed page numbers to convert. If None, convert all pages.
"""
    if not FITZ_AVAILABLE:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    try:
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
if pdf_document.page_count == 0:
raise ValueError("PDF has no pages")
# If no pages specified, convert all
if pages is None:
pages = list(range(pdf_document.page_count))
images = []
zoom = 2.0
mat = fitz.Matrix(zoom, zoom)
for page_num in pages:
if page_num < pdf_document.page_count:
page = pdf_document[page_num]
pix = page.get_pixmap(matrix=mat)
images.append(pix.tobytes("png"))
pdf_document.close()
logger.info(f"Converted {len(images)} PDF pages to images")
return images
except Exception as e:
logger.error(f"PDF conversion failed: {e}")
raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
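# ---------------------------------------------------------------------------
# Usage sketch (illustrative): the two directions this module serves, turning
# worksheet HTML into a PDF and rendering an uploaded scan PDF for OCR.
#
#   pdf = await generate_worksheet_pdf(html)
#   n = get_pdf_page_count(source_pdf_bytes)
#   if n:
#       png = await convert_pdf_page_to_image(source_pdf_bytes, page_number=0)
#       pages = await convert_pdf_to_images(source_pdf_bytes, list(range(n)))
# ---------------------------------------------------------------------------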


@@ -0,0 +1,86 @@
"""Pydantic models and enums for the Vocab Worksheet API."""
from datetime import datetime
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel
# =============================================================================
# Enums
# =============================================================================
class WorksheetType(str, Enum):
EN_TO_DE = "en_to_de" # English -> German translation
DE_TO_EN = "de_to_en" # German -> English translation
COPY_PRACTICE = "copy" # Write word multiple times
GAP_FILL = "gap_fill" # Fill in the blanks
COMBINED = "combined" # All types combined
class SessionStatus(str, Enum):
PENDING = "pending" # Session created, no upload yet
PROCESSING = "processing" # OCR in progress
EXTRACTED = "extracted" # Vocabulary extracted, ready to edit
COMPLETED = "completed" # Worksheet generated
# =============================================================================
# Pydantic Models
# =============================================================================
class VocabularyEntry(BaseModel):
id: str
english: str
german: str
example_sentence: Optional[str] = None
example_sentence_gap: Optional[str] = None # With ___ for gap-fill
word_type: Optional[str] = None # noun, verb, adjective, etc.
source_page: Optional[int] = None # Page number where entry was found (1-indexed)
class SessionCreate(BaseModel):
name: str
description: Optional[str] = None
source_language: str = "en" # Source language (default English)
target_language: str = "de" # Target language (default German)
class SessionResponse(BaseModel):
id: str
name: str
description: Optional[str]
source_language: str
target_language: str
status: str
vocabulary_count: int
image_path: Optional[str]
created_at: datetime
class VocabularyResponse(BaseModel):
session_id: str
vocabulary: List[VocabularyEntry]
extraction_confidence: Optional[float]
class VocabularyUpdate(BaseModel):
vocabulary: List[VocabularyEntry]
class WorksheetGenerateRequest(BaseModel):
worksheet_types: List[WorksheetType]
title: Optional[str] = None
include_solutions: bool = True
repetitions: int = 3 # For copy practice
line_height: str = "normal" # normal, large, extra-large
class WorksheetResponse(BaseModel):
id: str
session_id: str
worksheet_types: List[str]
pdf_path: str
solution_path: Optional[str]
generated_at: datetime
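# Example (illustrative): a minimal VocabularyEntry as the extractors produce
# it; ids are uuid4 strings generated by the producing modules.
#
#   entry = VocabularyEntry(
#       id="7f3a...",
#       english="to achieve",
#       german="erreichen, erzielen",
#       example_sentence="She achieved her goals.",
#       word_type="v",
#       source_page=1,
#   )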


@@ -0,0 +1,481 @@
"""
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.
Extracted from vocab_worksheet_api.py to keep file sizes manageable.
Pipeline steps:
orientation → deskew → dewarp → crop → scan-quality → enhance →
dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
vocab extraction → row merging
"""
import logging
import uuid
from typing import Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Optional heavy dependencies (not available in every environment)
# ---------------------------------------------------------------------------
try:
import cv2
import numpy as np
except ImportError:
cv2 = None # type: ignore[assignment]
np = None # type: ignore[assignment]
logger.warning("cv2 / numpy not available — OCR pipeline disabled")
try:
from PIL import Image
except ImportError:
Image = None # type: ignore[assignment]
try:
import pytesseract
except ImportError:
pytesseract = None # type: ignore[assignment]
# CV pipeline helpers
try:
from cv_vocab_pipeline import (
deskew_two_pass,
dewarp_image,
detect_and_fix_orientation,
_cells_to_vocab_entries,
_fix_phonetic_brackets,
)
except ImportError:
deskew_two_pass = None # type: ignore[assignment]
dewarp_image = None # type: ignore[assignment]
detect_and_fix_orientation = None # type: ignore[assignment]
_cells_to_vocab_entries = None # type: ignore[assignment]
_fix_phonetic_brackets = None # type: ignore[assignment]
try:
from cv_cell_grid import (
_merge_wrapped_rows,
_merge_phonetic_continuation_rows,
_merge_continuation_rows,
)
except ImportError:
_merge_wrapped_rows = None # type: ignore[assignment]
_merge_phonetic_continuation_rows = None # type: ignore[assignment]
_merge_continuation_rows = None # type: ignore[assignment]
try:
from cv_ocr_engines import ocr_region_rapid
except ImportError:
ocr_region_rapid = None # type: ignore[assignment]
try:
from cv_vocab_types import PageRegion
except ImportError:
PageRegion = None # type: ignore[assignment]
try:
from ocr_pipeline_ocr_merge import (
_split_paddle_multi_words,
_merge_paddle_tesseract,
_deduplicate_words,
)
except ImportError:
_split_paddle_multi_words = None # type: ignore[assignment]
_merge_paddle_tesseract = None # type: ignore[assignment]
_deduplicate_words = None # type: ignore[assignment]
try:
from cv_words_first import build_grid_from_words
except ImportError:
build_grid_from_words = None # type: ignore[assignment]
try:
from ocr_pipeline_session_store import (
create_session_db as create_pipeline_session_db,
update_session_db as update_pipeline_session_db,
)
except ImportError:
create_pipeline_session_db = None # type: ignore[assignment]
update_pipeline_session_db = None # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Main pipeline function
# ---------------------------------------------------------------------------
async def _run_ocr_pipeline_for_page(
img_bgr: "np.ndarray",
page_number: int,
vocab_session_id: str,
*,
ipa_mode: str = "none",
syllable_mode: str = "none",
enable_enhance: bool = True,
max_columns: Optional[int] = 3,
override_min_conf: Optional[int] = None,
) -> tuple:
"""Run the full Kombi OCR pipeline on a single page and return vocab entries.
Uses the same pipeline as the admin OCR Kombi pipeline:
orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
(with pipe-autocorrect, word-gap merge, dictionary detection, etc.)
Args:
img_bgr: BGR numpy array.
page_number: 0-indexed page number.
vocab_session_id: Vocab session ID for logging.
ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".
    Returns (entries, rotation_deg, scan_quality_report) where entries is a
    list of dicts, rotation_deg is the orientation correction applied
    (0, 90, 180, 270), and scan_quality_report may be None if scoring failed.
"""
import time as _time
t_total = _time.time()
img_h, img_w = img_bgr.shape[:2]
logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")
# 1. Orientation detection (fix upside-down scans)
t0 = _time.time()
img_bgr, rotation = detect_and_fix_orientation(img_bgr)
if rotation:
img_h, img_w = img_bgr.shape[:2]
logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
else:
logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")
# 2. Create pipeline session in DB (visible in admin Kombi UI)
pipeline_session_id = str(uuid.uuid4())
try:
_, png_buf = cv2.imencode(".png", img_bgr)
original_png = png_buf.tobytes()
await create_pipeline_session_db(
pipeline_session_id,
name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
filename=f"page_{page_number + 1}.png",
original_png=original_png,
)
except Exception as e:
logger.warning(f"Could not create pipeline session in DB: {e}")
# 3. Three-pass deskew
t0 = _time.time()
deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")
# 4. Dewarp
t0 = _time.time()
dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")
# 5. Content crop (removes scanner borders, gutter shadows)
t0 = _time.time()
try:
from page_crop import detect_and_crop_page
cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
if crop_result.get("crop_applied"):
dewarped_bgr = cropped_bgr
logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
else:
logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
except Exception as e:
logger.warning(f" crop: failed ({e}), continuing with uncropped image")
# 5b. Scan quality assessment
scan_quality_report = None
try:
from scan_quality import score_scan_quality
scan_quality_report = score_scan_quality(dewarped_bgr)
except Exception as e:
logger.warning(f" scan quality: failed ({e})")
if override_min_conf:
min_ocr_conf = override_min_conf
else:
min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
# 5c. Image enhancement for degraded scans
is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
if is_degraded and enable_enhance:
try:
from ocr_image_enhance import enhance_for_ocr
dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
logger.info(" enhancement: applied (degraded scan)")
except Exception as e:
logger.warning(f" enhancement: failed ({e})")
# 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
t0 = _time.time()
img_h, img_w = dewarped_bgr.shape[:2]
    # RapidOCR (local ONNX); ocr_region_rapid and PageRegion come from the
    # guarded module-level imports above
    try:
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
except Exception as e:
logger.warning(f" RapidOCR failed: {e}")
rapid_words = []
    # Tesseract (PIL.Image and pytesseract are imported at module level)
pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
data = pytesseract.image_to_data(
pil_img, lang="eng+deu", config="--psm 6 --oem 3",
output_type=pytesseract.Output.DICT,
)
tess_words = []
for i in range(len(data["text"])):
text = str(data["text"][i]).strip()
conf_raw = str(data["conf"][i])
conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
if not text or conf < min_ocr_conf:
continue
tess_words.append({
"text": text,
"left": data["left"][i], "top": data["top"][i],
"width": data["width"][i], "height": data["height"][i],
"conf": conf,
})
    # Merge dual-engine results (merge helpers and build_grid_from_words are
    # imported at module level)
rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
if rapid_split or tess_words:
merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
merged_words = _deduplicate_words(merged_words)
else:
merged_words = tess_words # fallback to Tesseract only
# Build initial grid from merged words
cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
for cell in cells:
cell["ocr_engine"] = "rapid_kombi"
n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
n_cols = len(columns_meta)
logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")
# 7. Save word_result to pipeline session (needed by _build_grid_core)
word_result = {
"cells": cells,
"grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
"columns_used": columns_meta,
"layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
"image_width": img_w,
"image_height": img_h,
"duration_seconds": 0,
"ocr_engine": "rapid_kombi",
"raw_tesseract_words": tess_words,
"summary": {
"total_cells": len(cells),
"non_empty_cells": sum(1 for c in cells if c.get("text")),
},
}
# Save images + word_result to pipeline session for admin visibility
try:
_, dsk_buf = cv2.imencode(".png", deskewed_bgr)
_, dwp_buf = cv2.imencode(".png", dewarped_bgr)
await update_pipeline_session_db(
pipeline_session_id,
deskewed_png=dsk_buf.tobytes(),
dewarped_png=dwp_buf.tobytes(),
cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
word_result=word_result,
deskew_result={"angle_applied": round(angle_applied, 3)},
dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
current_step=8,
)
except Exception as e:
logger.warning(f"Could not update pipeline session: {e}")
# 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
t0 = _time.time()
try:
from grid_editor_api import _build_grid_core
session_data = {
"word_result": word_result,
}
grid_result = await _build_grid_core(
pipeline_session_id, session_data,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
)
logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
f"({_time.time() - t0:.1f}s)")
# Save grid result to pipeline session
try:
await update_pipeline_session_db(
pipeline_session_id,
grid_editor_result=grid_result,
current_step=11,
)
except Exception:
pass
except Exception as e:
logger.warning(f" grid-build failed: {e}, falling back to basic grid")
grid_result = None
# 9. Extract vocab entries
# Prefer grid-build result (better column detection, more cells) over
# the initial build_grid_from_words() which often under-clusters.
page_vocabulary = []
extraction_source = "none"
# A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
if grid_result and grid_result.get("zones"):
for zone in grid_result["zones"]:
zone_cols = zone.get("columns", [])
zone_cells = zone.get("cells", [])
if not zone_cols or not zone_cells:
continue
# Sort columns by x position to determine roles
sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
col_idx_to_pos = {}
for pos, col in enumerate(sorted_cols):
ci = col.get("col_index", col.get("index", -1))
col_idx_to_pos[ci] = pos
# Skip zones with only 1 column (likely headers/boxes)
if len(sorted_cols) < 2:
continue
# Group cells by row
rows_map: dict = {}
for cell in zone_cells:
ri = cell.get("row_index", 0)
if ri not in rows_map:
rows_map[ri] = {}
ci = cell.get("col_index", 0)
rows_map[ri][ci] = (cell.get("text") or "").strip()
n_cols = len(sorted_cols)
for ri in sorted(rows_map.keys()):
row = rows_map[ri]
# Collect texts in column-position order
texts = []
for col in sorted_cols:
ci = col.get("col_index", col.get("index", -1))
texts.append(row.get(ci, ""))
if not any(texts):
continue
# Map by position, skipping narrow first column (page refs/markers)
# Heuristic: if first column is very narrow (<15% of zone width),
# it's likely a marker/ref column — skip it for vocab
first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
data_texts = texts[1:] if skip_first else texts
entry = {
"id": str(uuid.uuid4()),
"english": data_texts[0] if len(data_texts) > 0 else "",
"german": data_texts[1] if len(data_texts) > 1 else "",
"example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
"source_page": page_number + 1,
}
if entry["english"] or entry["german"]:
page_vocabulary.append(entry)
if page_vocabulary:
extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"
# B) Fallback: original cells with column classification
if not page_vocabulary:
col_types = {c.get("type") for c in columns_meta}
is_vocab = bool(col_types & {"column_en", "column_de"})
if is_vocab:
entries = _cells_to_vocab_entries(cells, columns_meta)
entries = _fix_phonetic_brackets(entries, pronunciation="british")
for entry in entries:
if not entry.get("english") and not entry.get("german"):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": entry.get("english", ""),
"german": entry.get("german", ""),
"example_sentence": entry.get("example", ""),
"source_page": page_number + 1,
})
extraction_source = f"classified ({len(columns_meta)} cols)"
else:
# Last resort: all cells by position
rows_map2: dict = {}
for cell in cells:
ri = cell.get("row_index", 0)
if ri not in rows_map2:
rows_map2[ri] = {}
ci = cell.get("col_index", 0)
rows_map2[ri][ci] = (cell.get("text") or "").strip()
all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
for ri in sorted(rows_map2.keys()):
row = rows_map2[ri]
texts = [row.get(ci, "") for ci in all_ci]
if not any(texts):
continue
page_vocabulary.append({
"id": str(uuid.uuid4()),
"english": texts[0] if len(texts) > 0 else "",
"german": texts[1] if len(texts) > 1 else "",
"example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
"source_page": page_number + 1,
})
extraction_source = f"generic ({len(all_ci)} cols)"
# --- Post-processing: merge cell-wrap continuation rows ---
if len(page_vocabulary) >= 2:
try:
# Convert to internal format (example_sentence → example)
internal = []
for v in page_vocabulary:
internal.append({
'row_index': len(internal),
'english': v.get('english', ''),
'german': v.get('german', ''),
'example': v.get('example_sentence', ''),
})
n_before = len(internal)
internal = _merge_wrapped_rows(internal)
internal = _merge_phonetic_continuation_rows(internal)
internal = _merge_continuation_rows(internal)
if len(internal) < n_before:
# Rebuild page_vocabulary from merged entries
merged_vocab = []
for entry in internal:
if not entry.get('english') and not entry.get('german'):
continue
merged_vocab.append({
'id': str(uuid.uuid4()),
'english': entry.get('english', ''),
'german': entry.get('german', ''),
'example_sentence': entry.get('example', ''),
'source_page': page_number + 1,
})
logger.info(f" row merging: {n_before}{len(merged_vocab)} entries")
page_vocabulary = merged_vocab
except Exception as e:
logger.warning(f" row merging failed (non-critical): {e}")
logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")
total_duration = _time.time() - t_total
logger.info(f"Kombi Pipeline page {page_number + 1}: "
f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")
return page_vocabulary, rotation, scan_quality_report
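# ---------------------------------------------------------------------------
# Usage sketch (illustrative; assumes the CV stack is installed and the PDF
# is already loaded as bytes):
#
#   from cv_preprocessing import render_pdf_high_res
#   img_bgr = render_pdf_high_res(pdf_bytes, 0, zoom=3.0)
#   entries, rotation, quality = await _run_ocr_pipeline_for_page(
#       img_bgr, page_number=0, vocab_session_id=session_id, max_columns=3,
#   )
# ---------------------------------------------------------------------------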


@@ -0,0 +1,490 @@
"""
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
Routes (no prefix — included into the main /api/v1/vocab router):
POST /sessions/{session_id}/upload-pdf-info
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
GET /sessions/{session_id}/pdf-page-image/{page_number}
POST /sessions/{session_id}/process-single-page/{page_number}
POST /sessions/{session_id}/process-pages
"""
import io
import logging
import os
import uuid
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
from fastapi.responses import StreamingResponse
from vocab_worksheet_models import SessionStatus, VocabularyEntry
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Local storage path
# ---------------------------------------------------------------------------
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
# ---------------------------------------------------------------------------
# Optional heavy dependencies
# ---------------------------------------------------------------------------
try:
import numpy as np
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
OCR_PIPELINE_AVAILABLE = True
except ImportError:
np = None # type: ignore[assignment]
OCR_PIPELINE_AVAILABLE = False
logger.warning("OCR pipeline imports not available in upload module")
# Sub-module imports (already split out)
from vocab_worksheet_generation import (
convert_pdf_page_to_image,
convert_pdf_to_images,
get_pdf_page_count,
)
from vocab_worksheet_extraction import extract_vocabulary_from_image
try:
from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
except ImportError:
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
# ---------------------------------------------------------------------------
# In-memory session store (shared with main module)
# ---------------------------------------------------------------------------
def _get_sessions():
    # Late import: vocab_worksheet_api includes this router and imports this
    # module, so a top-level import would be circular.
    from vocab_worksheet_api import _sessions
    return _sessions
# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------
upload_router = APIRouter()
# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================
@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
session_id: str,
file: UploadFile = File(...),
):
"""
Upload a PDF and get page count and thumbnails for preview.
Use this before processing to let user select pages.
"""
logger.info(f"PDF info request for session {session_id}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
# Validate file type
extension = file.filename.split('.')[-1].lower() if file.filename else ''
content_type = file.content_type or ''
if extension != 'pdf' and content_type != 'application/pdf':
raise HTTPException(status_code=400, detail="Only PDF files supported for this endpoint")
content = await file.read()
# Save PDF temporarily
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
os.makedirs(session_dir, exist_ok=True)
pdf_path = os.path.join(session_dir, "source.pdf")
with open(pdf_path, 'wb') as f:
f.write(content)
# Get page count
page_count = get_pdf_page_count(content)
# Store PDF data in session for later processing
session["pdf_data"] = content
session["pdf_path"] = pdf_path
session["pdf_page_count"] = page_count
session["status"] = "pdf_uploaded"
# Detect orientation for each page so thumbnails are shown correctly
page_rotations: dict = {}
if OCR_PIPELINE_AVAILABLE:
for pg in range(page_count):
try:
img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
_, rotation = detect_and_fix_orientation(img_bgr)
if rotation:
page_rotations[pg] = rotation
logger.info(f"Page {pg + 1}: orientation {rotation}°")
except Exception as e:
logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
session["page_rotations"] = page_rotations
return {
"session_id": session_id,
"page_count": page_count,
"filename": file.filename,
"page_rotations": page_rotations,
}
# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
"""Get a thumbnail image of a specific PDF page.
Uses fitz for rendering so that page_rotations (from OCR orientation
detection) are applied consistently.
Args:
hires: If True, return full-resolution image (zoom=2.0) instead of thumbnail (zoom=0.5).
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
try:
import fitz
zoom = 2.0 if hires else 0.5
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
page = pdf_document[page_number]
# Apply orientation correction detected during OCR processing
rot = session.get("page_rotations", {}).get(page_number, 0)
if rot:
page.set_rotation(rot)
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
except Exception as e:
logger.error(f"PDF thumbnail failed: {e}")
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
return StreamingResponse(
io.BytesIO(png_data),
media_type="image/png",
)
# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================
@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
"""PDF page as PNG at arbitrary resolution (for editor view).
Args:
zoom: Zoom factor (0.5=72DPI, 1.0=144DPI, 2.0=288DPI, 4.0=576DPI).
"""
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
try:
import fitz
pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
page = pdf_document[page_number]
# Apply orientation correction detected during OCR processing
rot = session.get("page_rotations", {}).get(page_number, 0)
if rot:
page.set_rotation(rot)
mat = fitz.Matrix(zoom, zoom)
pix = page.get_pixmap(matrix=mat)
png_data = pix.tobytes("png")
pdf_document.close()
logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
except Exception as e:
logger.error(f"PDF page image failed: {e}")
raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")
return StreamingResponse(
io.BytesIO(png_data),
media_type="image/png",
)
# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
session_id: str,
page_number: int,
ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
"""
Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.
Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
dual-engine OCR -> grid-build with autocorrect/merge) for best quality.
Query params:
ipa_mode: "none" (default), "auto", "all", "en", "de"
syllable_mode: "none" (default), "auto", "all", "en", "de"
enhance: true (default) -- apply CLAHE/denoise for degraded scans
max_cols: 3 (default) -- max column count (0=unlimited)
min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)
The frontend should call this sequentially for each page.
Returns the vocabulary for just this one page.
"""
logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")
if session_id not in _get_sessions():
raise HTTPException(
status_code=404,
detail="Session nicht im Speicher. Bitte erstellen Sie eine neue Session und laden Sie das PDF erneut hoch.",
)
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
if page_number < 0 or page_number >= page_count:
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
# Derive pipeline-level variable names for the quality report
enable_enhance = enhance
max_columns = max_cols if max_cols > 0 else None
override_min_conf = min_conf if min_conf > 0 else None
# --- OCR Pipeline path (use same render_pdf_high_res as admin OCR pipeline) ---
rotation_deg = 0
quality_report = None
min_ocr_conf = 40 # default; overridden by pipeline when quality report is available
if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
try:
img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
img_bgr, page_number, session_id,
ipa_mode=ipa_mode, syllable_mode=syllable_mode,
enable_enhance=enable_enhance,
max_columns=max_columns,
override_min_conf=override_min_conf,
)
# Update min_ocr_conf from quality report if available
if quality_report and hasattr(quality_report, 'recommended_min_conf'):
min_ocr_conf = quality_report.recommended_min_conf
except Exception as e:
logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
return {
"session_id": session_id,
"page_number": page_number + 1,
"success": False,
"error": f"OCR pipeline error: {e}",
"vocabulary": [],
"vocabulary_count": 0,
}
else:
# Fallback to LLM vision extraction
logger.warning("OCR pipeline not available, falling back to LLM vision")
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
vocabulary, confidence, error = await extract_vocabulary_from_image(
image_data,
f"page_{page_number + 1}.png",
page_number=page_number
)
if error:
logger.warning(f"Page {page_number + 1} failed: {error}")
return {
"session_id": session_id,
"page_number": page_number + 1,
"success": False,
"error": error,
"vocabulary": [],
"vocabulary_count": 0,
}
page_vocabulary = []
for entry in vocabulary:
entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
entry_dict['source_page'] = page_number + 1
if 'id' not in entry_dict or not entry_dict['id']:
entry_dict['id'] = str(uuid.uuid4())
page_vocabulary.append(entry_dict)
logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} Vokabeln extrahiert")
# Store rotation for this page (used by image/thumbnail endpoints)
session.setdefault("page_rotations", {})[page_number] = rotation_deg
# Add to session's vocabulary (append, don't replace)
existing_vocab = session.get("vocabulary", [])
# Remove any existing entries from this page (in case of re-processing)
existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
existing_vocab.extend(page_vocabulary)
session["vocabulary"] = existing_vocab
session["vocabulary_count"] = len(existing_vocab)
session["status"] = SessionStatus.EXTRACTED.value
result = {
"session_id": session_id,
"page_number": page_number + 1,
"success": True,
"vocabulary": page_vocabulary,
"vocabulary_count": len(page_vocabulary),
"total_vocabulary_count": len(existing_vocab),
"extraction_confidence": 0.9,
"rotation": rotation_deg,
}
# Add scan quality report + active steps info
if quality_report:
sq = quality_report.to_dict()
sq["active_steps"] = {
"step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
"step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
"step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
}
result["scan_quality"] = sq
return result
# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================
@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
session_id: str,
    pages: Optional[List[int]] = None,
process_all: bool = False,
):
"""
Process specific pages of an uploaded PDF.
DEPRECATED: Use /process-single-page/{page_number} instead for better results.
Args:
pages: List of 0-indexed page numbers to process
process_all: If True, process all pages
"""
logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")
if session_id not in _get_sessions():
raise HTTPException(status_code=404, detail="Session not found")
session = _get_sessions()[session_id]
pdf_data = session.get("pdf_data")
if not pdf_data:
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
page_count = session.get("pdf_page_count", 1)
# Determine which pages to process
if process_all:
pages = list(range(page_count))
elif pages is None or len(pages) == 0:
pages = [0] # Default to first page
# Convert selected pages to images
images = await convert_pdf_to_images(pdf_data, pages)
# Extract vocabulary from each page SEQUENTIALLY
all_vocabulary = []
total_confidence = 0.0
successful_pages = []
failed_pages = []
error_messages = []
for i, image_data in enumerate(images):
page_num = pages[i]
logger.info(f"Extracting vocabulary from page {page_num + 1} of {len(images)}...")
vocabulary, confidence, error = await extract_vocabulary_from_image(
image_data,
f"page_{page_num + 1}.png",
page_number=page_num
)
if error:
failed_pages.append(page_num + 1)
error_messages.append(error)
logger.warning(f"Page {page_num + 1} failed: {error}")
else:
successful_pages.append(page_num + 1)
total_confidence += confidence
# Add page info to each entry and convert to dict
for entry in vocabulary:
entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
entry_dict['source_page'] = page_num + 1
all_vocabulary.append(entry_dict)
logger.info(f"Page {page_num + 1}: {len(vocabulary)} Vokabeln extrahiert")
avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0
# Update session
session["vocabulary"] = all_vocabulary
session["vocabulary_count"] = len(all_vocabulary)
session["extraction_confidence"] = avg_confidence
session["processed_pages"] = pages
session["successful_pages"] = successful_pages
session["failed_pages"] = failed_pages
session["status"] = SessionStatus.EXTRACTED.value
# Save first page as preview image
if images:
session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
image_path = os.path.join(session_dir, "source.png")
with open(image_path, 'wb') as f:
f.write(images[0])
session["image_path"] = image_path
result = {
"session_id": session_id,
"pages_processed": len(pages),
"pages_successful": len(successful_pages),
"pages_failed": len(failed_pages),
"successful_pages": successful_pages,
"failed_pages": failed_pages,
"vocabulary_count": len(all_vocabulary),
"extraction_confidence": avg_confidence,
"status": SessionStatus.EXTRACTED.value,
}
if error_messages:
result["errors"] = error_messages
return result
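# ---------------------------------------------------------------------------
# Client sketch (illustrative; host and port are assumptions, the router is
# mounted under /api/v1/vocab): upload the PDF once, then process each page
# sequentially via the preferred single-page endpoint.
#
#   async with httpx.AsyncClient(base_url="http://localhost:8000") as c:
#       files = {"file": ("book.pdf", pdf_bytes, "application/pdf")}
#       info = (await c.post(f"/api/v1/vocab/sessions/{sid}/upload-pdf-info",
#                            files=files)).json()
#       for page in range(info["page_count"]):
#           r = await c.post(f"/api/v1/vocab/sessions/{sid}/process-single-page/{page}")
#           print(page + 1, r.json().get("vocabulary_count"))
# ---------------------------------------------------------------------------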