Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
The sed replacement left orphaned hostname references in the story page and empty lines in the getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
File diff suppressed because it is too large
klausur-service/backend/cv_layout_analyze.py (new file, 257 lines)
@@ -0,0 +1,257 @@
"""
Legacy layout analysis using projection profiles.

Extracted from cv_layout_columns.py — contains:
- analyze_layout() (projection-profile based column/header/footer detection)

License: Apache 2.0 (commercially usable)
PRIVACY: All processing is performed locally.
"""

import logging
from typing import List

import numpy as np

from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]


def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area.

    Args:
        layout_img: CLAHE-enhanced grayscale image.
        ocr_img: Binarized image for text density analysis.

    Returns:
        List of PageRegion objects describing detected regions.
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection
    inv = cv2.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Find valleys using multiple threshold strategies
    # Strategy 1: relative to median (catches clear separators)
    # Strategy 2: local minima approach (catches subtle gaps)
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    in_valley = v_proj_smooth < threshold

    # Find contiguous valley regions
    all_valleys = []
    start = None
    for x in range(len(v_proj_smooth)):
        if in_valley[x] and start is None:
            start = x
        elif not in_valley[x] and start is not None:
            valley_width = x - start
            valley_depth = float(np.min(v_proj_smooth[start:x]))
            # Valley must be at least 3px wide
            if valley_width >= 3:
                all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
            start = None

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # If no valleys found with strict threshold, try local minima approach
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments, find the 2 lowest
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
            seg_scores.append((i, sx, ex, seg_mean))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Find two lowest non-adjacent segments that create reasonable columns
        candidate_valleys = []
        for seg_idx, sx, ex, seg_mean in seg_scores:
            # Must not be at the edges
            if seg_idx <= 1 or seg_idx >= seg_count - 2:
                continue
            # Must be significantly lower than overall mean
            if seg_mean < p_mean * 0.6:
                center = (sx + ex) // 2
                candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))

        if len(candidate_valleys) >= 2:
            # Pick the best pair: non-adjacent, creating reasonable column widths
            candidate_valleys.sort(key=lambda v: v[2])
            best_pair = None
            best_score = float('inf')
            for i in range(len(candidate_valleys)):
                for j in range(i + 1, len(candidate_valleys)):
                    c1 = candidate_valleys[i][2]
                    c2 = candidate_valleys[j][2]
                    # Must be at least 20% apart
                    if (c2 - c1) < content_w * 0.2:
                        continue
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    # Each column at least 12%
                    if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
                        continue
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    if score < best_score:
                        best_score = score
                        best_pair = (candidate_valleys[i], candidate_valleys[j])

            if best_pair:
                valleys = list(best_pair)
                logger.info(f"Layout: local minima found 2 valleys: "
                            f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        if len(valleys) == 2:
            sep1_center = valleys[0][2]
            sep2_center = valleys[1][2]
        else:
            # Pick the two valleys that best divide into 3 parts
            # Prefer wider valleys (more likely true separators)
            best_pair = None
            best_score = float('inf')
            for i in range(len(valleys)):
                for j in range(i + 1, len(valleys)):
                    c1, c2 = valleys[i][2], valleys[j][2]
                    # Each column should be at least 15% of content width
                    col1 = c1
                    col2 = c2 - c1
                    col3 = content_w - c2
                    if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
                        continue
                    # Score: lower is better (more even distribution)
                    parts = sorted([col1, col2, col3])
                    score = parts[2] - parts[0]
                    # Bonus for wider valleys (subtract valley width)
                    score -= (valleys[i][3] + valleys[j][3]) * 0.5
                    if score < best_score:
                        best_score = score
                        best_pair = (c1, c2)
            if best_pair:
                sep1_center, sep2_center = best_pair
            else:
                sep1_center = valleys[0][2]
                sep2_center = valleys[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    # Lazy import to avoid circular dependency with cv_layout.py
    from cv_layout_detection import _add_header_footer
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
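For orientation, and not part of this commit: a minimal sketch of driving analyze_layout() on a scanned page. The input path and the CLAHE/Otsu preprocessing are assumptions for illustration; in the real pipeline, layout_img and ocr_img come from earlier stages.

import cv2
from cv_layout_analyze import analyze_layout

# Hypothetical input; the service prepares these images upstream.
gray = cv2.imread("page.png", cv2.IMREAD_GRAYSCALE)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
layout_img = clahe.apply(gray)                      # contrast-enhanced grayscale
_, ocr_img = cv2.threshold(layout_img, 0, 255,
                           cv2.THRESH_BINARY + cv2.THRESH_OTSU)  # binarized

regions = analyze_layout(layout_img, ocr_img)
for r in regions:
    print(r.type, r.x, r.y, r.width, r.height)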
klausur-service/backend/cv_layout_classify.py (new file, 494 lines)
@@ -0,0 +1,494 @@
"""
Column type classification for OCR layout analysis.

Entry point: classify_column_types() with 4-level fallback chain.
Also provides positional_column_regions() and _build_margin_regions().
Position-based classifiers (Level 2+3) in cv_layout_classify_position.py.
"""

import logging
from typing import Dict, List, Optional

import numpy as np

from cv_vocab_types import ColumnGeometry, PageRegion

from cv_layout_scoring import (
    _score_language,
    _score_role,
    _score_dictionary_signals,
    _classify_dictionary_columns,
)

from cv_layout_classify_position import (
    _classify_by_position_enhanced,
    _classify_by_position_fallback,
)

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Margin Region Building
# ---------------------------------------------------------------------------

def _build_margin_regions(
    all_regions: List[PageRegion],
    left_x: int,
    right_x: int,
    img_w: int,
    top_y: int,
    content_h: int,
) -> List[PageRegion]:
    """Create margin_left / margin_right PageRegions from content bounds.

    Margins represent the space between the image edge and the first/last
    content column. They are used downstream for faithful page
    reconstruction but are skipped during OCR.
    """
    margins: List[PageRegion] = []
    # Minimum gap (px) to create a margin region
    _min_gap = 5

    if left_x > _min_gap:
        margins.append(PageRegion(
            type='margin_left', x=0, y=top_y,
            width=left_x, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    # Right margin: from end of last content column to image edge
    non_margin = [r for r in all_regions
                  if r.type not in ('margin_left', 'margin_right', 'header', 'footer',
                                    'margin_top', 'margin_bottom')]
    if non_margin:
        last_col_end = max(r.x + r.width for r in non_margin)
    else:
        last_col_end = right_x
    if img_w - last_col_end > _min_gap:
        margins.append(PageRegion(
            type='margin_right', x=last_col_end, y=top_y,
            width=img_w - last_col_end, height=content_h,
            classification_confidence=1.0,
            classification_method='content_bounds',
        ))

    if margins:
        logger.info(f"Margins: {[(m.type, m.x, m.width) for m in margins]} "
                    f"(left_x={left_x}, right_x={right_x}, img_w={img_w})")

    return margins


# ---------------------------------------------------------------------------
# Positional Column Regions
# ---------------------------------------------------------------------------

def positional_column_regions(
    geometries: List[ColumnGeometry],
    content_w: int,
    content_h: int,
    left_x: int,
) -> List[PageRegion]:
    """Classify columns by position only (no language scoring).

    Structural columns (page_ref, column_marker) are identified by geometry.
    Remaining content columns are labelled left->right as column_en, column_de,
    column_example. The names are purely positional -- no language analysis.
    """
    structural: List[PageRegion] = []
    content_cols: List[ColumnGeometry] = []

    for g in geometries:
        rel_x = g.x - left_x
        # page_ref: narrow column in the leftmost 20% region
        if g.width_ratio < 0.12 and (rel_x / content_w if content_w else 0) < 0.20:
            structural.append(PageRegion(
                type='page_ref', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='positional',
            ))
        # column_marker: very narrow, few words
        elif g.width_ratio < 0.06 and g.word_count <= 15:
            structural.append(PageRegion(
                type='column_marker', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='positional',
            ))
        # empty or near-empty narrow column -> treat as margin/structural
        elif g.word_count <= 2 and g.width_ratio < 0.15:
            structural.append(PageRegion(
                type='column_marker', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.85,
                classification_method='positional',
            ))
        else:
            content_cols.append(g)

    # Single content column -> plain text page
    if len(content_cols) == 1:
        g = content_cols[0]
        return structural + [PageRegion(
            type='column_text', x=g.x, y=g.y,
            width=g.width, height=content_h,
            classification_confidence=0.9,
            classification_method='positional',
        )]

    # No content columns
    if not content_cols:
        return structural

    # Sort content columns left->right and assign positional labels
    content_cols.sort(key=lambda g: g.x)

    # With exactly 2 content columns: if the left one is very wide (>35%),
    # it likely contains EN+DE combined, so the right one is examples.
    if (len(content_cols) == 2
            and content_cols[0].width_ratio > 0.35
            and content_cols[1].width_ratio > 0.20):
        labels = ['column_en', 'column_example']
    else:
        labels = ['column_en', 'column_de', 'column_example']

    regions = list(structural)
    for i, g in enumerate(content_cols):
        label = labels[i] if i < len(labels) else 'column_example'
        regions.append(PageRegion(
            type=label, x=g.x, y=g.y,
            width=g.width, height=content_h,
            classification_confidence=0.95,
            classification_method='positional',
        ))

    logger.info(f"PositionalColumns: {len(structural)} structural, "
                f"{len(content_cols)} content -> "
                f"{[r.type for r in regions]}")
    return regions


# ---------------------------------------------------------------------------
# Main Classification Entry Point
# ---------------------------------------------------------------------------

def classify_column_types(geometries: List[ColumnGeometry],
                          content_w: int,
                          top_y: int,
                          img_w: int,
                          img_h: int,
                          bottom_y: int,
                          left_x: int = 0,
                          right_x: int = 0,
                          inv: Optional[np.ndarray] = None,
                          document_category: Optional[str] = None,
                          margin_strip_detected: bool = False) -> List[PageRegion]:
    """Classify column types using a 4-level fallback chain.

    Level 0: Dictionary detection (if signals are strong enough)
    Level 1: Content-based (language + role scoring)
    Level 2: Position + language (old rules enhanced with language detection)
    Level 3: Pure position (exact old code, no regression)

    Args:
        geometries: List of ColumnGeometry from Phase A.
        content_w: Total content width.
        top_y: Top Y of content area.
        img_w: Full image width.
        img_h: Full image height.
        bottom_y: Bottom Y of content area.
        left_x: Left content bound (from _find_content_bounds).
        right_x: Right content bound (from _find_content_bounds).
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        List of PageRegion with types, confidence, and method.
    """
    # _add_header_footer lives in cv_layout (avoids circular import at module
    # level). Lazy-import here so the module can be tested independently when
    # cv_layout hasn't been modified yet.
    from cv_layout_detection import _add_header_footer  # noqa: E402

    content_h = bottom_y - top_y

    def _with_margins(result: List[PageRegion]) -> List[PageRegion]:
        """Append margin_left / margin_right regions to *result*."""
        margins = _build_margin_regions(result, left_x, right_x, img_w, top_y, content_h)
        return result + margins

    # Special case: single column -> plain text page
    if len(geometries) == 1:
        geom = geometries[0]
        return _with_margins([PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        )])

    # --- Pre-filter: first/last columns with very few words -> column_ignore ---
    # Sub-columns from _detect_sub_columns() are exempt: they intentionally
    # have few words (page refs, markers) and should not be discarded.
    ignore_regions = []
    active_geometries = []
    for idx, g in enumerate(geometries):
        if (idx == 0 or idx == len(geometries) - 1) and g.word_count < 8 and not g.is_sub_column:
            ignore_regions.append(PageRegion(
                type='column_ignore', x=g.x, y=g.y,
                width=g.width, height=content_h,
                classification_confidence=0.95,
                classification_method='content',
            ))
            logger.info(f"ClassifyColumns: column {idx} (x={g.x}, words={g.word_count}) -> column_ignore (edge, few words)")
        else:
            active_geometries.append(g)

    # Re-index active geometries for classification
    for new_idx, g in enumerate(active_geometries):
        g.index = new_idx
    geometries = active_geometries

    # Handle edge case: all columns ignored or only 1 left
    if len(geometries) == 0:
        return _with_margins(ignore_regions)
    if len(geometries) == 1:
        geom = geometries[0]
        ignore_regions.append(PageRegion(
            type='column_text', x=geom.x, y=geom.y,
            width=geom.width, height=geom.height,
            classification_confidence=0.9,
            classification_method='content',
        ))
        return _with_margins(ignore_regions)

    # --- Score all columns ---
    lang_scores = [_score_language(g.words) for g in geometries]
    role_scores = [_score_role(g) for g in geometries]

    logger.info(f"ClassifyColumns: language scores: "
                f"{[(g.index, ls) for g, ls in zip(geometries, lang_scores)]}")
    logger.info(f"ClassifyColumns: role scores: "
                f"{[(g.index, rs) for g, rs in zip(geometries, role_scores)]}")

    # --- Level 0: Dictionary detection ---
    dict_signals = _score_dictionary_signals(
        geometries,
        document_category=document_category,
        margin_strip_detected=margin_strip_detected,
    )
    if dict_signals["is_dictionary"]:
        regions = _classify_dictionary_columns(
            geometries, dict_signals, lang_scores, content_h,
        )
        if regions is not None:
            logger.info("ClassifyColumns: Level 0 (dictionary) succeeded, confidence=%.3f",
                        dict_signals["confidence"])
            _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
            return _with_margins(ignore_regions + regions)

    # --- Level 1: Content-based classification ---
    regions = _classify_by_content(geometries, lang_scores, role_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 1 (content-based) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 2: Position + language enhanced ---
    regions = _classify_by_position_enhanced(geometries, lang_scores, content_w, content_h)
    if regions is not None:
        logger.info("ClassifyColumns: Level 2 (position+language) succeeded")
        _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
        return _with_margins(ignore_regions + regions)

    # --- Level 3: Pure position fallback (old code, no regression) ---
    logger.info("ClassifyColumns: Level 3 (position fallback)")
    regions = _classify_by_position_fallback(geometries, content_w, content_h)
    _add_header_footer(regions, top_y, bottom_y, img_w, img_h, inv=inv)
    return _with_margins(ignore_regions + regions)


# ---------------------------------------------------------------------------
# Level 1: Content-Based Classification
# ---------------------------------------------------------------------------

def _classify_by_content(geometries: List[ColumnGeometry],
                         lang_scores: List[Dict[str, float]],
                         role_scores: List[Dict[str, float]],
                         content_w: int,
                         content_h: int) -> Optional[List[PageRegion]]:
    """Level 1: Classify columns purely by content analysis.

    Requires clear language signals to distinguish EN/DE columns.
    Returns None if language signals are too weak.
    """
    regions = []
    assigned = set()

    # Step 1: Assign structural roles first (reference, marker)
    # left_20_threshold: only the leftmost ~20% of content area qualifies for page_ref
    left_20_threshold = geometries[0].x + content_w * 0.20 if geometries else 0

    for i, (geom, rs, ls) in enumerate(zip(geometries, role_scores, lang_scores)):
        is_left_side = geom.x < left_20_threshold
        has_strong_language = ls['eng'] > 0.3 or ls['deu'] > 0.3
        if rs['reference'] >= 0.5 and geom.width_ratio < 0.12 and is_left_side and not has_strong_language:
            regions.append(PageRegion(
                type='page_ref', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['reference'],
                classification_method='content',
            ))
            assigned.add(i)
        elif rs['marker'] >= 0.7 and geom.width_ratio < 0.06:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=rs['marker'],
                classification_method='content',
            ))
            assigned.add(i)
        elif geom.width_ratio < 0.05 and not is_left_side:
            # Narrow column on the right side -> marker, not page_ref
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.8,
                classification_method='content',
            ))
            assigned.add(i)

    # Step 2: Among remaining columns, find EN and DE by language scores
    remaining = [(i, geometries[i], lang_scores[i], role_scores[i])
                 for i in range(len(geometries)) if i not in assigned]

    if len(remaining) < 2:
        # Not enough columns for EN/DE pair
        if len(remaining) == 1:
            i, geom, ls, rs = remaining[0]
            regions.append(PageRegion(
                type='column_text', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.6,
                classification_method='content',
            ))
        regions.sort(key=lambda r: r.x)
        return regions

    # Check if we have enough language signal
    en_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['eng'] > ls['deu'] and ls['eng'] > 0.05]
    de_candidates = [(i, g, ls) for i, g, ls, rs in remaining if ls['deu'] > ls['eng'] and ls['deu'] > 0.05]

    # Position tiebreaker: when language signals are weak, use left=EN, right=DE
    if (not en_candidates or not de_candidates) and len(remaining) >= 2:
        max_eng = max(ls['eng'] for _, _, ls, _ in remaining)
        max_deu = max(ls['deu'] for _, _, ls, _ in remaining)
        if max_eng < 0.15 and max_deu < 0.15:
            # Both signals weak -- fall back to positional: left=EN, right=DE
            sorted_remaining = sorted(remaining, key=lambda x: x[1].x)
            best_en = (sorted_remaining[0][0], sorted_remaining[0][1], sorted_remaining[0][2])
            best_de = (sorted_remaining[1][0], sorted_remaining[1][1], sorted_remaining[1][2])
            logger.info("ClassifyColumns: Level 1 using position tiebreaker (weak signals) - left=EN, right=DE")
            en_conf = 0.4
            de_conf = 0.4

            regions.append(PageRegion(
                type='column_en', x=best_en[1].x, y=best_en[1].y,
                width=best_en[1].width, height=content_h,
                classification_confidence=en_conf,
                classification_method='content',
            ))
            assigned.add(best_en[0])

            regions.append(PageRegion(
                type='column_de', x=best_de[1].x, y=best_de[1].y,
                width=best_de[1].width, height=content_h,
                classification_confidence=de_conf,
                classification_method='content',
            ))
            assigned.add(best_de[0])

            # Assign remaining as example
            for i, geom, ls, rs in remaining:
                if i not in assigned:
                    regions.append(PageRegion(
                        type='column_example', x=geom.x, y=geom.y,
                        width=geom.width, height=content_h,
                        classification_confidence=0.4,
                        classification_method='content',
                    ))
            regions.sort(key=lambda r: r.x)
            return regions

    if not en_candidates or not de_candidates:
        # Language signals too weak for content-based classification
        logger.info("ClassifyColumns: Level 1 failed - no clear EN/DE language split")
        return None

    # Pick the best EN and DE candidates
    best_en = max(en_candidates, key=lambda x: x[2]['eng'])
    best_de = max(de_candidates, key=lambda x: x[2]['deu'])

    # Position-aware EN selection: in typical textbooks the layout is EN | DE | Example.
    # Example sentences contain English function words ("the", "a", "is") which inflate
    # the eng score of the Example column. When the best EN candidate sits to the RIGHT
    # of the DE column and there is another EN candidate to the LEFT, prefer the left one
    # -- it is almost certainly the real vocabulary column.
    if best_de[2]['deu'] > 0.5 and best_en[1].x > best_de[1].x and len(en_candidates) > 1:
        left_of_de = [c for c in en_candidates if c[1].x < best_de[1].x]
        if left_of_de:
            alt_en = max(left_of_de, key=lambda x: x[2]['eng'])
            logger.info(
                f"ClassifyColumns: Level 1 position fix -- best EN col {best_en[0]} "
                f"(eng={best_en[2]['eng']:.3f}) is right of DE col {best_de[0]}; "
                f"preferring left col {alt_en[0]} (eng={alt_en[2]['eng']:.3f})")
            best_en = alt_en

    if best_en[0] == best_de[0]:
        # Same column scored highest for both -- ambiguous
        logger.info("ClassifyColumns: Level 1 failed - same column highest for EN and DE")
        return None

    en_conf = best_en[2]['eng']
    de_conf = best_de[2]['deu']

    regions.append(PageRegion(
        type='column_en', x=best_en[1].x, y=best_en[1].y,
        width=best_en[1].width, height=content_h,
        classification_confidence=round(en_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_en[0])

    regions.append(PageRegion(
        type='column_de', x=best_de[1].x, y=best_de[1].y,
        width=best_de[1].width, height=content_h,
        classification_confidence=round(de_conf, 2),
        classification_method='content',
    ))
    assigned.add(best_de[0])

    # Step 3: Remaining columns -> example or text based on role scores
    for i, geom, ls, rs in remaining:
        if i in assigned:
            continue
        if rs['sentence'] > 0.4:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=round(rs['sentence'], 2),
                classification_method='content',
            ))
        else:
            regions.append(PageRegion(
                type='column_example', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.5,
                classification_method='content',
            ))

    regions.sort(key=lambda r: r.x)
    return regions
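Not part of the diff: a sketch of calling classify_column_types() on two synthetic columns. The ColumnGeometry constructor matches its use elsewhere in this commit, but the 'text' key in the word dicts is an assumption about what _score_language() (in the unshown cv_layout_scoring module) reads, so the printed labels depend on that module's behavior.

from cv_vocab_types import ColumnGeometry
from cv_layout_classify import classify_column_types

def make_col(idx, x, width, words, content_w=1400):
    return ColumnGeometry(index=idx, x=x, y=100, width=width, height=1800,
                          word_count=len(words), words=words,
                          width_ratio=width / content_w, is_sub_column=False)

# At least 8 words per column so the edge pre-filter does not mark them column_ignore.
en = [{'left': 60, 'top': 40 * i, 'width': 90, 'height': 22, 'conf': 90, 'text': t}
      for i, t in enumerate(['the', 'house', 'to', 'run', 'green', 'water', 'between', 'although'])]
de = [{'left': 760, 'top': 40 * i, 'width': 90, 'height': 22, 'conf': 90, 'text': t}
      for i, t in enumerate(['das', 'Haus', 'laufen', 'grün', 'Wasser', 'zwischen', 'obwohl', 'und'])]

cols = [make_col(0, 50, 600, en), make_col(1, 750, 600, de)]
regions = classify_column_types(cols, content_w=1400, top_y=100,
                                img_w=1500, img_h=2000, bottom_y=1900)
print([(r.type, r.classification_method) for r in regions])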
klausur-service/backend/cv_layout_classify_position.py (new file, 218 lines)
@@ -0,0 +1,218 @@
"""
Position-based column type classification for OCR layout analysis.

Contains Level 2 and Level 3 classification functions:
Level 2 – _classify_by_position_enhanced: Position + language confirmation
Level 3 – _classify_by_position_fallback: Pure positional (no regression)

Extracted from cv_layout_classify.py during file-size split.
"""

import logging
from typing import Dict, List, Optional

from cv_vocab_types import ColumnGeometry, PageRegion

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Level 2: Position-Enhanced Classification
# ---------------------------------------------------------------------------

def _classify_by_position_enhanced(geometries: List[ColumnGeometry],
                                   lang_scores: List[Dict[str, float]],
                                   content_w: int,
                                   content_h: int) -> Optional[List[PageRegion]]:
    """Level 2: Position-based rules enhanced with language confirmation.

    Uses the old positional heuristics but confirms EN/DE assignment
    with language scores (swapping if needed).
    """
    regions = []
    untyped = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%, no strong language)
    g0 = geometries[0]
    ls0 = lang_scores[0]
    has_strong_lang_0 = ls0['eng'] > 0.3 or ls0['deu'] > 0.3
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold and not has_strong_lang_0:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=0.8,
            classification_method='position_enhanced',
        ))
        untyped.remove(0)

    # Rule 2: Narrow columns with few words -> marker
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=0.7,
                classification_method='position_enhanced',
            ))
            untyped.remove(i)

    # Rule 3: Rightmost remaining -> column_example (if 3+ remaining)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.7,
            classification_method='position_enhanced',
        ))
        untyped.remove(last_idx)

    # Rule 4: First two remaining -> EN/DE, but check language to possibly swap
    if len(untyped) >= 2:
        idx_a = untyped[0]
        idx_b = untyped[1]
        ls_a = lang_scores[idx_a]
        ls_b = lang_scores[idx_b]

        # Default: first=EN, second=DE (old behavior)
        en_idx, de_idx = idx_a, idx_b
        conf = 0.7

        # Swap if language signals clearly indicate the opposite
        if ls_a['deu'] > ls_a['eng'] and ls_b['eng'] > ls_b['deu']:
            en_idx, de_idx = idx_b, idx_a
            conf = 0.85
            logger.info("ClassifyColumns: Level 2 swapped EN/DE based on language scores")

        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=conf,
            classification_method='position_enhanced',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))
        untyped = []

    # Remaining -> example
    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=0.5,
            classification_method='position_enhanced',
        ))

    regions.sort(key=lambda r: r.x)
    return regions


# ---------------------------------------------------------------------------
# Level 3: Position Fallback Classification
# ---------------------------------------------------------------------------

def _classify_by_position_fallback(geometries: List[ColumnGeometry],
                                   content_w: int,
                                   content_h: int) -> List[PageRegion]:
    """Level 3: Pure position-based fallback (identical to old code).

    Guarantees no regression from the previous behavior.
    """
    regions = []
    untyped = list(range(len(geometries)))
    first_x = geometries[0].x if geometries else 0
    left_20_threshold = first_x + content_w * 0.20

    # Rule 1: Leftmost narrow column -> page_ref (only if in left 20%)
    g0 = geometries[0]
    if g0.width_ratio < 0.12 and g0.x < left_20_threshold:
        regions.append(PageRegion(
            type='page_ref', x=g0.x, y=g0.y,
            width=g0.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped.remove(0)

    # Rule 2: Narrow + few words -> marker
    for i in list(untyped):
        geom = geometries[i]
        if geom.width_ratio < 0.06 and geom.word_count <= 15:
            regions.append(PageRegion(
                type='column_marker', x=geom.x, y=geom.y,
                width=geom.width, height=content_h,
                classification_confidence=1.0,
                classification_method='position_fallback',
            ))
            untyped.remove(i)

    # Rule 3: Rightmost remaining -> example (if 3+)
    if len(untyped) >= 3:
        last_idx = untyped[-1]
        geom = geometries[last_idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped.remove(last_idx)

    # Rule 4: First remaining -> EN, second -> DE
    if len(untyped) >= 2:
        en_idx = untyped[0]
        de_idx = untyped[1]
        regions.append(PageRegion(
            type='column_en', x=geometries[en_idx].x, y=geometries[en_idx].y,
            width=geometries[en_idx].width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        regions.append(PageRegion(
            type='column_de', x=geometries[de_idx].x, y=geometries[de_idx].y,
            width=geometries[de_idx].width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped = untyped[2:]
    elif len(untyped) == 1:
        idx = untyped[0]
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_en', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))
        untyped = []

    for idx in untyped:
        geom = geometries[idx]
        regions.append(PageRegion(
            type='column_example', x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=1.0,
            classification_method='position_fallback',
        ))

    regions.sort(key=lambda r: r.x)
    return regions
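Not part of the diff: the Level 3 rules applied to a hypothetical four-column page (page-ref strip, EN, DE, examples); all geometry values are invented. Level 3 never reads the word dicts, so they are left empty here.

from cv_vocab_types import ColumnGeometry
from cv_layout_classify_position import _classify_by_position_fallback

def col(i, x, w, n, content_w=1400):
    return ColumnGeometry(index=i, x=x, y=0, width=w, height=1800,
                          word_count=n, words=[], width_ratio=w / content_w,
                          is_sub_column=False)

cols = [col(0, 40, 80, 30),     # narrow + in leftmost 20% -> page_ref (Rule 1)
        col(1, 150, 420, 120),  # first remaining -> column_en (Rule 4)
        col(2, 600, 420, 110),  # second remaining -> column_de (Rule 4)
        col(3, 1050, 380, 90)]  # rightmost of 3+ -> column_example (Rule 3)
regions = _classify_by_position_fallback(cols, content_w=1400, content_h=1800)
print([r.type for r in regions])
# ['page_ref', 'column_en', 'column_de', 'column_example']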
klausur-service/backend/cv_layout_column_refine.py (new file, 458 lines)
@@ -0,0 +1,458 @@
|
||||
"""
|
||||
Post-processing refinements for column geometry.
|
||||
|
||||
Extracted from cv_layout_columns.py — contains:
|
||||
- _detect_sub_columns() (sub-column detection via left-edge alignment)
|
||||
- _split_broad_columns() (broad column splitting via word-coverage gaps)
|
||||
- expand_narrow_columns() (narrow column expansion into whitespace)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import statistics
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import ColumnGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _detect_sub_columns(
|
||||
geometries: List[ColumnGeometry],
|
||||
content_w: int,
|
||||
left_x: int = 0,
|
||||
top_y: int = 0,
|
||||
header_y: Optional[int] = None,
|
||||
footer_y: Optional[int] = None,
|
||||
_edge_tolerance: int = 8,
|
||||
_min_col_start_ratio: float = 0.10,
|
||||
) -> List[ColumnGeometry]:
|
||||
"""Split columns that contain internal sub-columns based on left-edge alignment.
|
||||
|
||||
For each column, clusters word left-edges into alignment bins (within
|
||||
``_edge_tolerance`` px). The leftmost bin whose word count reaches
|
||||
``_min_col_start_ratio`` of the column total is treated as the true column
|
||||
start. Any words to the left of that bin form a sub-column, provided they
|
||||
number >= 2 and < 35 % of total.
|
||||
|
||||
Word ``left`` values are relative to the content ROI (offset by *left_x*),
|
||||
while ``ColumnGeometry.x`` is in absolute image coordinates. *left_x*
|
||||
bridges the two coordinate systems.
|
||||
|
||||
If *header_y* / *footer_y* are provided (absolute y-coordinates), words
|
||||
in header/footer regions are excluded from alignment clustering to avoid
|
||||
polluting the bins with page numbers or chapter titles. Word ``top``
|
||||
values are relative to *top_y*.
|
||||
|
||||
Returns a new list of ColumnGeometry — potentially longer than the input.
|
||||
"""
|
||||
if content_w <= 0:
|
||||
return geometries
|
||||
|
||||
result: List[ColumnGeometry] = []
|
||||
for geo in geometries:
|
||||
# Only consider wide-enough columns with enough words
|
||||
if geo.width_ratio < 0.15 or geo.word_count < 5:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# Collect left-edges of confident words, excluding header/footer
|
||||
# Convert header_y/footer_y from absolute to relative (word 'top' is relative to top_y)
|
||||
min_top_rel = (header_y - top_y) if header_y is not None else None
|
||||
max_top_rel = (footer_y - top_y) if footer_y is not None else None
|
||||
|
||||
confident = [w for w in geo.words
|
||||
if w.get('conf', 0) >= 30
|
||||
and (min_top_rel is None or w['top'] >= min_top_rel)
|
||||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||||
if len(confident) < 3:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# --- Cluster left-edges into alignment bins ---
|
||||
sorted_edges = sorted(w['left'] for w in confident)
|
||||
bins: List[Tuple[int, int, int, int]] = [] # (center, count, min_edge, max_edge)
|
||||
cur = [sorted_edges[0]]
|
||||
for i in range(1, len(sorted_edges)):
|
||||
if sorted_edges[i] - cur[-1] <= _edge_tolerance:
|
||||
cur.append(sorted_edges[i])
|
||||
else:
|
||||
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
|
||||
cur = [sorted_edges[i]]
|
||||
bins.append((sum(cur) // len(cur), len(cur), min(cur), max(cur)))
|
||||
|
||||
# --- Find the leftmost bin qualifying as a real column start ---
|
||||
total = len(confident)
|
||||
min_count = max(3, int(total * _min_col_start_ratio))
|
||||
col_start_bin = None
|
||||
for b in bins:
|
||||
if b[1] >= min_count:
|
||||
col_start_bin = b
|
||||
break
|
||||
|
||||
if col_start_bin is None:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# Words to the left of the column-start bin are sub-column candidates
|
||||
split_threshold = col_start_bin[2] - _edge_tolerance
|
||||
sub_words = [w for w in geo.words if w['left'] < split_threshold]
|
||||
main_words = [w for w in geo.words if w['left'] >= split_threshold]
|
||||
|
||||
# Count only body words (excluding header/footer) for the threshold check
|
||||
# so that header/footer words don't artificially trigger a split.
|
||||
sub_body = [w for w in sub_words
|
||||
if (min_top_rel is None or w['top'] >= min_top_rel)
|
||||
and (max_top_rel is None or w['top'] <= max_top_rel)]
|
||||
if len(sub_body) < 2 or len(sub_body) / len(geo.words) >= 0.35:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# --- Guard against inline markers (bullet points, numbering) ---
|
||||
# Bullet points like "1.", "2.", "•", "-" sit close to the main
|
||||
# column text and are part of the cell, not a separate column.
|
||||
# Only split if the horizontal gap between the rightmost sub-word
|
||||
# and the main column start is large enough.
|
||||
max_sub_right = max(w['left'] + w.get('width', 0) for w in sub_words)
|
||||
gap_to_main = col_start_bin[2] - max_sub_right # px gap
|
||||
median_heights = [w.get('height', 20) for w in confident]
|
||||
med_h = statistics.median(median_heights) if median_heights else 20
|
||||
min_gap = max(med_h * 1.2, 20) # at least 1.2× word height or 20px
|
||||
if gap_to_main < min_gap:
|
||||
logger.debug(
|
||||
"SubColumnSplit: column idx=%d skipped — gap=%dpx < min=%dpx "
|
||||
"(likely inline markers, not a sub-column)",
|
||||
geo.index, gap_to_main, min_gap)
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# --- Build two sub-column geometries ---
|
||||
# Word 'left' values are relative to left_x; geo.x is absolute.
|
||||
# Convert the split position from relative to absolute coordinates.
|
||||
max_sub_left = max(w['left'] for w in sub_words)
|
||||
split_rel = (max_sub_left + col_start_bin[2]) // 2
|
||||
split_abs = split_rel + left_x
|
||||
|
||||
sub_x = geo.x
|
||||
sub_width = split_abs - geo.x
|
||||
main_x = split_abs
|
||||
main_width = (geo.x + geo.width) - split_abs
|
||||
|
||||
if sub_width <= 0 or main_width <= 0:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
sub_geo = ColumnGeometry(
|
||||
index=0,
|
||||
x=sub_x,
|
||||
y=geo.y,
|
||||
width=sub_width,
|
||||
height=geo.height,
|
||||
word_count=len(sub_words),
|
||||
words=sub_words,
|
||||
width_ratio=sub_width / content_w if content_w > 0 else 0.0,
|
||||
is_sub_column=True,
|
||||
)
|
||||
main_geo = ColumnGeometry(
|
||||
index=0,
|
||||
x=main_x,
|
||||
y=geo.y,
|
||||
width=main_width,
|
||||
height=geo.height,
|
||||
word_count=len(main_words),
|
||||
words=main_words,
|
||||
width_ratio=main_width / content_w if content_w > 0 else 0.0,
|
||||
is_sub_column=True,
|
||||
)
|
||||
|
||||
result.append(sub_geo)
|
||||
result.append(main_geo)
|
||||
|
||||
logger.info(
|
||||
f"SubColumnSplit: column idx={geo.index} split at abs_x={split_abs} "
|
||||
f"(rel={split_rel}), sub={len(sub_words)} words, "
|
||||
f"main={len(main_words)} words, "
|
||||
f"col_start_bin=({col_start_bin[0]}, n={col_start_bin[1]})"
|
||||
)
|
||||
|
||||
# Re-index by left-to-right order
|
||||
result.sort(key=lambda g: g.x)
|
||||
for i, g in enumerate(result):
|
||||
g.index = i
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _split_broad_columns(
|
||||
geometries: List[ColumnGeometry],
|
||||
content_w: int,
|
||||
left_x: int = 0,
|
||||
_broad_threshold: float = 0.35,
|
||||
_min_gap_px: int = 15,
|
||||
_min_words_per_split: int = 5,
|
||||
) -> List[ColumnGeometry]:
|
||||
"""Split overly broad columns that contain two language blocks (EN+DE).
|
||||
|
||||
Uses word-coverage gap analysis: builds a per-pixel coverage array from the
|
||||
words inside each broad column, finds the largest horizontal gap, and splits
|
||||
the column at that gap.
|
||||
|
||||
Args:
|
||||
geometries: Column geometries from _detect_sub_columns.
|
||||
content_w: Width of the content area in pixels.
|
||||
left_x: Left edge of content ROI in absolute image coordinates.
|
||||
_broad_threshold: Minimum width_ratio to consider a column "broad".
|
||||
_min_gap_px: Minimum gap width (pixels) to trigger a split.
|
||||
_min_words_per_split: Both halves must have at least this many words.
|
||||
|
||||
Returns:
|
||||
Updated list of ColumnGeometry (possibly with more columns).
|
||||
"""
|
||||
result: List[ColumnGeometry] = []
|
||||
|
||||
logger.info(f"SplitBroadCols: input {len(geometries)} cols: "
|
||||
f"{[(g.index, g.x, g.width, g.word_count, round(g.width_ratio, 3)) for g in geometries]}")
|
||||
|
||||
for geo in geometries:
|
||||
if geo.width_ratio <= _broad_threshold or len(geo.words) < 10:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# Build word-coverage array (per pixel within column)
|
||||
col_left_rel = geo.x - left_x # column left in content-relative coords
|
||||
coverage = np.zeros(geo.width, dtype=np.float32)
|
||||
|
||||
for wd in geo.words:
|
||||
# wd['left'] is relative to left_x (content ROI)
|
||||
wl = wd['left'] - col_left_rel
|
||||
wr = wl + wd.get('width', 0)
|
||||
wl = max(0, int(wl))
|
||||
wr = min(geo.width, int(wr))
|
||||
if wr > wl:
|
||||
coverage[wl:wr] += 1.0
|
||||
|
||||
# Light smoothing (kernel=3px) to avoid noise
|
||||
if len(coverage) > 3:
|
||||
kernel = np.ones(3, dtype=np.float32) / 3.0
|
||||
coverage = np.convolve(coverage, kernel, mode='same')
|
||||
|
||||
# Normalise to [0, 1]
|
||||
cmax = coverage.max()
|
||||
if cmax > 0:
|
||||
coverage /= cmax
|
||||
|
||||
# Find INTERNAL gaps where coverage < 0.5
|
||||
# Exclude edge gaps (touching pixel 0 or geo.width) — those are margins.
|
||||
low_mask = coverage < 0.5
|
||||
all_gaps = []
|
||||
_gs = None
|
||||
for px in range(len(low_mask)):
|
||||
if low_mask[px]:
|
||||
if _gs is None:
|
||||
_gs = px
|
||||
else:
|
||||
if _gs is not None:
|
||||
all_gaps.append((_gs, px, px - _gs))
|
||||
_gs = None
|
||||
if _gs is not None:
|
||||
all_gaps.append((_gs, len(low_mask), len(low_mask) - _gs))
|
||||
|
||||
# Filter: only internal gaps (not touching column edges)
|
||||
_edge_margin = 10 # pixels from edge to ignore
|
||||
internal_gaps = [g for g in all_gaps
|
||||
if g[0] > _edge_margin and g[1] < geo.width - _edge_margin]
|
||||
best_gap = max(internal_gaps, key=lambda g: g[2]) if internal_gaps else None
|
||||
|
||||
logger.info(f"SplitBroadCols: col {geo.index} all_gaps(>=5px): "
|
||||
f"{[g for g in all_gaps if g[2] >= 5]}, "
|
||||
f"internal(>=5px): {[g for g in internal_gaps if g[2] >= 5]}, "
|
||||
f"best={best_gap}")
|
||||
|
||||
if best_gap is None or best_gap[2] < _min_gap_px:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
gap_center = (best_gap[0] + best_gap[1]) // 2
|
||||
|
||||
# Split words by midpoint relative to gap
|
||||
left_words = []
|
||||
right_words = []
|
||||
for wd in geo.words:
|
||||
wl = wd['left'] - col_left_rel
|
||||
mid = wl + wd.get('width', 0) / 2.0
|
||||
if mid < gap_center:
|
||||
left_words.append(wd)
|
||||
else:
|
||||
right_words.append(wd)
|
||||
|
||||
if len(left_words) < _min_words_per_split or len(right_words) < _min_words_per_split:
|
||||
result.append(geo)
|
||||
continue
|
||||
|
||||
# Build two new ColumnGeometry objects
|
||||
split_x_abs = geo.x + gap_center
|
||||
left_w = gap_center
|
||||
right_w = geo.width - gap_center
|
||||
|
||||
left_geo = ColumnGeometry(
|
||||
index=0,
|
||||
x=geo.x,
|
||||
y=geo.y,
|
||||
width=left_w,
|
||||
height=geo.height,
|
||||
word_count=len(left_words),
|
||||
words=left_words,
|
||||
width_ratio=left_w / content_w if content_w else 0,
|
||||
is_sub_column=True,
|
||||
)
|
||||
right_geo = ColumnGeometry(
|
||||
index=0,
|
||||
x=split_x_abs,
|
||||
y=geo.y,
|
||||
width=right_w,
|
||||
height=geo.height,
|
||||
word_count=len(right_words),
|
||||
words=right_words,
|
||||
width_ratio=right_w / content_w if content_w else 0,
|
||||
is_sub_column=True,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"SplitBroadCols: col {geo.index} SPLIT at gap_center={gap_center} "
|
||||
f"(gap {best_gap[2]}px @ [{best_gap[0]}..{best_gap[1]}]), "
|
||||
f"left={len(left_words)} words (w={left_w}), "
|
||||
f"right={len(right_words)} words (w={right_w})"
|
||||
)
|
||||
|
||||
result.append(left_geo)
|
||||
result.append(right_geo)
|
||||
|
||||
# Re-index left-to-right
|
||||
result.sort(key=lambda g: g.x)
|
||||
for i, g in enumerate(result):
|
||||
g.index = i
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def expand_narrow_columns(
|
||||
geometries: List[ColumnGeometry],
|
||||
content_w: int,
|
||||
left_x: int,
|
||||
word_dicts: List[Dict],
|
||||
) -> List[ColumnGeometry]:
|
||||
"""Expand narrow columns into adjacent whitespace gaps.
|
||||
|
||||
Narrow columns (marker, page_ref, < 10% content width) often lose
|
||||
content at image edges due to residual shear. This expands them toward
|
||||
the neighbouring column, but never past 40% of the gap or past the
|
||||
nearest word in the neighbour.
|
||||
|
||||
Must be called AFTER _detect_sub_columns() so that sub-column splits
|
||||
(which create the narrowest columns) have already happened.
|
||||
"""
|
||||
_NARROW_THRESHOLD_PCT = 10.0
|
||||
_MIN_WORD_MARGIN = 4
|
||||
|
||||
if len(geometries) < 2:
|
||||
return geometries
|
||||
|
||||
logger.info("ExpandNarrowCols: input %d cols: %s",
|
||||
len(geometries),
|
||||
                [(i, g.x, g.width, round(g.width / content_w * 100, 1))
                 for i, g in enumerate(geometries)])

    for i, g in enumerate(geometries):
        col_pct = g.width / content_w * 100 if content_w > 0 else 100
        if col_pct >= _NARROW_THRESHOLD_PCT:
            continue

        expanded = False
        orig_pct = col_pct

        # --- try expanding to the LEFT ---
        if i > 0:
            left_nb = geometries[i - 1]
            # Gap can be 0 if sub-column split created adjacent columns.
            # In that case, look at where the neighbor's rightmost words
            # actually are — there may be unused space we can claim.
            nb_words_right = [wd['left'] + wd.get('width', 0)
                              for wd in left_nb.words]
            if nb_words_right:
                rightmost_word_abs = left_x + max(nb_words_right)
                safe_left_abs = rightmost_word_abs + _MIN_WORD_MARGIN
            else:
                # No words in neighbor → we can take up to neighbor's start
                safe_left_abs = left_nb.x + _MIN_WORD_MARGIN

            if safe_left_abs < g.x:
                g.width += (g.x - safe_left_abs)
                g.x = safe_left_abs
                expanded = True

        # --- try expanding to the RIGHT ---
        if i + 1 < len(geometries):
            right_nb = geometries[i + 1]
            nb_words_left = [wd['left'] for wd in right_nb.words]
            if nb_words_left:
                leftmost_word_abs = left_x + min(nb_words_left)
                safe_right_abs = leftmost_word_abs - _MIN_WORD_MARGIN
            else:
                safe_right_abs = right_nb.x + right_nb.width - _MIN_WORD_MARGIN

            cur_right = g.x + g.width
            if safe_right_abs > cur_right:
                g.width = safe_right_abs - g.x
                expanded = True

        if expanded:
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
            g.width_ratio = g.width / content_w if content_w > 0 else 0.0
            logger.info(
                "ExpandNarrowCols: col %d (%.1f%% → %.1f%%) x=%d w=%d words=%d",
                i, orig_pct, g.width / content_w * 100, g.x, g.width, g.word_count)

            # --- Shrink overlapping neighbors to match new boundaries ---
            # Left neighbor: its right edge must not exceed our new left edge
            if i > 0:
                left_nb = geometries[i - 1]
                nb_right = left_nb.x + left_nb.width
                if nb_right > g.x:
                    left_nb.width = g.x - left_nb.x
                    if left_nb.width < 0:
                        left_nb.width = 0
                    left_nb.width_ratio = left_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = left_nb.x - left_x
                    nb_right_rel = nb_left_rel + left_nb.width
                    left_nb.words = [wd for wd in word_dicts
                                     if nb_left_rel <= wd['left'] < nb_right_rel]
                    left_nb.word_count = len(left_nb.words)

            # Right neighbor: its left edge must not be before our new right edge
            if i + 1 < len(geometries):
                right_nb = geometries[i + 1]
                my_right = g.x + g.width
                if right_nb.x < my_right:
                    old_right_edge = right_nb.x + right_nb.width
                    right_nb.x = my_right
                    right_nb.width = old_right_edge - right_nb.x
                    if right_nb.width < 0:
                        right_nb.width = 0
                    right_nb.width_ratio = right_nb.width / content_w if content_w > 0 else 0.0
                    # Re-assign words
                    nb_left_rel = right_nb.x - left_x
                    nb_right_rel = nb_left_rel + right_nb.width
                    right_nb.words = [wd for wd in word_dicts
                                      if nb_left_rel <= wd['left'] < nb_right_rel]
                    right_nb.word_count = len(right_nb.words)

    return geometries
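
# Worked example of the expansion above (hypothetical values; the real
# thresholds are the module constants defined earlier in this file): assume
# content_w = 2000, _NARROW_THRESHOLD_PCT = 10 and _MIN_WORD_MARGIN = 8.
# A column g at x=1500, width=120 is 6.0% wide and qualifies. If the left
# neighbor's rightmost word ends at absolute x=1420, then
# safe_left_abs = 1420 + 8 = 1428 < 1500, so g grows to x=1428, width=192
# (9.6%), and the neighbor is clipped to end at x=1428.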
589
klausur-service/backend/cv_layout_columns.py
Normal file
@@ -0,0 +1,589 @@
"""
Core column detection: gap-based geometry and clustering fallback.

Extracted from the original cv_layout_columns.py — contains:
- _detect_columns_by_clustering() (fallback clustering)
- _build_geometries_from_starts() (geometry construction)
- detect_column_geometry() (main column detection)

Post-processing (sub-columns, broad-column split, narrow expansion)
lives in cv_layout_column_refine.py.
Legacy projection-profile layout lives in cv_layout_analyze.py.

License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""

import logging
from typing import Dict, List, Optional, Tuple

import numpy as np

from cv_vocab_types import ColumnGeometry
from cv_layout_detection import _find_content_bounds

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

try:
    import pytesseract
    from PIL import Image
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    Image = None  # type: ignore[assignment,misc]


# =============================================================================
# Stage 5b: Word-Based Layout Analysis (Two-Phase Column Detection)
# =============================================================================

# --- Phase A: Geometry Detection ---

def _detect_columns_by_clustering(
        word_dicts: List[Dict],
        left_edges: List[int],
        edge_word_indices: List[int],
        content_w: int,
        content_h: int,
        left_x: int,
        right_x: int,
        top_y: int,
        bottom_y: int,
        inv: Optional[np.ndarray] = None,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.
    """
    tolerance = max(10, int(content_w * 0.01))
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])

    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
    )
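
# Worked example (hypothetical values): with content_w = 1800 the cluster
# tolerance is max(10, int(1800 * 0.01)) = 18px, so word left edges at
# x = 102, 110, 117 form one cluster while x = 640 opens a new one. With
# merge_distance = max(30, int(1800 * 0.06)) = 108px, clusters at
# mean_x = 640 and 720 are merged into a single column candidate at the
# count-weighted average x.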


def _build_geometries_from_starts(
        col_starts: List[Tuple[int, int]],
        word_dicts: List[Dict],
        left_x: int,
        right_x: int,
        top_y: int,
        bottom_y: int,
        content_w: int,
        content_h: int,
        inv: Optional[np.ndarray] = None,
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
    geometries = []
    for i, (start_x, count) in enumerate(col_starts):
        if i + 1 < len(col_starts):
            col_width = col_starts[i + 1][0] - start_x
        else:
            col_width = right_x - start_x

        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
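
# Worked example (hypothetical values): col_starts = [(120, 40), (660, 35)]
# with left_x = 100 and right_x = 1180 yields two columns, x=120 w=540 and
# x=660 w=520, since each column ends where the next begins and the last
# runs to right_x. Words are binned by their ROI-relative left edge, so a
# word at rel-x 300 falls into the first column (rel range 20..560).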


def detect_column_geometry(
        ocr_img: np.ndarray,
        dewarped_bgr: np.ndarray,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
    """Detect column geometry using whitespace-gap analysis with word validation.

    Phase A of the two-phase column detection. Uses vertical projection
    profiles to find whitespace gaps between columns, then validates that
    no gap cuts through a word bounding box.

    Falls back to clustering-based detection if fewer than 2 gaps are found.

    Args:
        ocr_img: Binarized grayscale image for layout analysis.
        dewarped_bgr: Original BGR image (for Tesseract word detection).

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
        or None if detection fails entirely.
    """
    h, w = ocr_img.shape[:2]

    # --- Step 1: Find content bounds ---
    inv = cv2.bitwise_not(ocr_img)
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_w < w * 0.3 or content_h < h * 0.3:
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px)")

    # --- Step 2: Get word bounding boxes from Tesseract ---
    # Crop from left_x to full image width (not right_x) so words at the right
    # edge of the last column are included even if they extend past the detected
    # content boundary (right_x).
    content_roi = dewarped_bgr[top_y:bottom_y, left_x:w]
    pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))

    try:
        data = pytesseract.image_to_data(pil_img, lang='eng+deu', output_type=pytesseract.Output.DICT)
    except Exception as e:
        logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
        return None

    word_dicts = []
    left_edges = []
    edge_word_indices = []
    n_words = len(data['text'])
    for i in range(n_words):
        conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
        text = str(data['text'][i]).strip()
        if conf < 30 or not text:
            continue
        lx = int(data['left'][i])
        ty = int(data['top'][i])
        bw = int(data['width'][i])
        bh = int(data['height'][i])
        left_edges.append(lx)
        edge_word_indices.append(len(word_dicts))
        word_dicts.append({
            'text': text, 'conf': conf,
            'left': lx, 'top': ty, 'width': bw, 'height': bh,
        })

    if len(left_edges) < 5:
        logger.warning(f"ColumnGeometry: only {len(left_edges)} words detected")
        return None

    logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")

    # --- Step 2b: Segment by sub-headers ---
    # Pages with sub-headers (e.g. "Unit 4: Bonnie Scotland") have full-width
    # text bands that pollute the vertical projection. We detect large
    # horizontal gaps (= whitespace rows separating sections) and use only
    # the tallest content segment for the projection. This makes column
    # detection immune to sub-headers, illustrations, and section dividers.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj_row = np.sum(content_strip, axis=1).astype(float)
    h_proj_row_norm = h_proj_row / (content_w * 255) if content_w > 0 else h_proj_row

    # Find horizontal gaps (near-empty rows)
    H_GAP_THRESH = 0.02  # rows with <2% ink density are "empty"
    h_in_gap = h_proj_row_norm < H_GAP_THRESH
    H_MIN_GAP = max(5, content_h // 200)  # min gap height ~5-7px

    h_gaps: List[Tuple[int, int]] = []
    h_gap_start = None
    for y_idx in range(len(h_in_gap)):
        if h_in_gap[y_idx]:
            if h_gap_start is None:
                h_gap_start = y_idx
        else:
            if h_gap_start is not None:
                if y_idx - h_gap_start >= H_MIN_GAP:
                    h_gaps.append((h_gap_start, y_idx))
                h_gap_start = None
    if h_gap_start is not None and len(h_in_gap) - h_gap_start >= H_MIN_GAP:
        h_gaps.append((h_gap_start, len(h_in_gap)))

    # Identify "large" gaps (significantly bigger than median) that indicate
    # section boundaries (sub-headers, chapter titles).
    if len(h_gaps) >= 3:
        gap_sizes = sorted(g[1] - g[0] for g in h_gaps)
        median_gap_h = gap_sizes[len(gap_sizes) // 2]
        large_gap_thresh = max(median_gap_h * 1.8, H_MIN_GAP + 3)
        large_gaps = [(gs, ge) for gs, ge in h_gaps if ge - gs >= large_gap_thresh]
    else:
        large_gaps = h_gaps

    # Build content segments between large gaps and pick the tallest
    seg_boundaries = [0]
    for gs, ge in large_gaps:
        seg_boundaries.append(gs)
        seg_boundaries.append(ge)
    seg_boundaries.append(content_h)

    segments = []
    for i in range(0, len(seg_boundaries) - 1, 2):
        seg_top = seg_boundaries[i]
        seg_bot = seg_boundaries[i + 1] if i + 1 < len(seg_boundaries) else content_h
        seg_height = seg_bot - seg_top
        if seg_height > 20:  # ignore tiny fragments
            segments.append((seg_top, seg_bot, seg_height))

    if segments:
        segments.sort(key=lambda s: s[2], reverse=True)
        best_seg = segments[0]
        proj_strip = content_strip[best_seg[0]:best_seg[1], :]
        effective_h = best_seg[2]
        if len(segments) > 1:
            logger.info(f"ColumnGeometry: {len(segments)} segments from {len(large_gaps)} "
                        f"large h-gaps, using tallest: rows {best_seg[0]}..{best_seg[1]} "
                        f"({effective_h}px, {effective_h*100/content_h:.0f}%)")
    else:
        proj_strip = content_strip
        effective_h = content_h
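
    # Worked example (hypothetical values): with content_h = 2400 the minimum
    # gap height is max(5, 2400 // 200) = 12px. For gap heights [14, 15, 16, 40]
    # the code picks gap_sizes[2] = 16 as the median, so large_gap_thresh =
    # max(16 * 1.8, 12 + 3) = 28.8 and only the 40px gap counts as a section
    # break; the tallest resulting segment then drives the vertical projection.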

    # --- Step 3: Vertical projection profile ---
    v_proj = np.sum(proj_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (effective_h * 255) if effective_h > 0 else v_proj

    # Smooth the projection to avoid noise-induced micro-gaps
    kernel_size = max(5, content_w // 80)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep odd for symmetry
    v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # --- Step 4: Find whitespace gaps ---
    # Threshold: areas with very little ink density are gaps
    median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
    gap_threshold = max(median_density * 0.15, 0.005)

    in_gap = v_smooth < gap_threshold
    MIN_GAP_WIDTH = max(8, content_w // 200)  # min ~8px or 0.5% of content width

    # Collect contiguous gap regions
    raw_gaps = []  # (start_x_rel, end_x_rel) relative to content ROI
    gap_start = None
    for x in range(len(in_gap)):
        if in_gap[x]:
            if gap_start is None:
                gap_start = x
        else:
            if gap_start is not None:
                gap_width = x - gap_start
                if gap_width >= MIN_GAP_WIDTH:
                    raw_gaps.append((gap_start, x))
                gap_start = None
    # Handle gap at the right edge
    if gap_start is not None:
        gap_width = len(in_gap) - gap_start
        if gap_width >= MIN_GAP_WIDTH:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_width={MIN_GAP_WIDTH}px): "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")

    # --- Step 5: Validate gaps against word bounding boxes ---
    # When using a segment for projection, only validate against words
    # inside that segment — words from sub-headers or other sections
    # would incorrectly overlap with real column gaps.
    if segments and len(segments) > 1:
        seg_top_abs = best_seg[0]  # relative to content strip
        seg_bot_abs = best_seg[1]
        segment_words = [wd for wd in word_dicts
                         if wd['top'] >= seg_top_abs
                         and wd['top'] + wd['height'] <= seg_bot_abs]
        logger.info(f"ColumnGeometry: filtering words to segment: "
                    f"{len(segment_words)}/{len(word_dicts)} words")
    else:
        segment_words = word_dicts

    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        # Check if any word overlaps with this gap region
        overlapping = False
        for wd in segment_words:
            word_left = wd['left']
            word_right = wd['left'] + wd['width']
            if word_left < gap_end_rel and word_right > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid the overlapping word(s):
            # find the tightest word boundaries within the gap region.
            min_word_left = content_w
            max_word_right = 0
            for wd in segment_words:
                word_left = wd['left']
                word_right = wd['left'] + wd['width']
                if word_left < gap_end_rel and word_right > gap_start_rel:
                    min_word_left = min(min_word_left, word_left)
                    max_word_right = max(max_word_right, word_right)

            # Try gap before the overlapping words
            if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
                validated_gaps.append((gap_start_rel, min_word_left))
                logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
            # Try gap after the overlapping words
            elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
                validated_gaps.append((max_word_right, gap_end_rel))
                logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
            else:
                logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
                f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")

    # --- Step 5b: Word-coverage gap detection (fallback for noisy scans) ---
    # When pixel-based projection fails (e.g. due to illustrations or colored
    # bands), use word bounding boxes to find clear vertical gaps. This is
    # immune to decorative graphics that Tesseract doesn't recognise as words.
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 pixel-gaps, trying word-coverage gaps")
        word_coverage = np.zeros(content_w, dtype=np.int32)
        for wd in segment_words:
            wl = max(0, wd['left'])
            wr = min(wd['left'] + wd['width'], content_w)
            if wr > wl:
                word_coverage[wl:wr] += 1

        # Smooth slightly to bridge tiny 1-2px noise gaps between words
        wc_kernel = max(3, content_w // 300)
        if wc_kernel % 2 == 0:
            wc_kernel += 1
        wc_smooth = np.convolve(word_coverage.astype(float),
                                np.ones(wc_kernel) / wc_kernel, mode='same')

        wc_in_gap = wc_smooth < 0.5  # effectively zero word coverage
        WC_MIN_GAP = max(4, content_w // 300)

        wc_gaps: List[Tuple[int, int]] = []
        wc_gap_start = None
        for x in range(len(wc_in_gap)):
            if wc_in_gap[x]:
                if wc_gap_start is None:
                    wc_gap_start = x
            else:
                if wc_gap_start is not None:
                    if x - wc_gap_start >= WC_MIN_GAP:
                        wc_gaps.append((wc_gap_start, x))
                    wc_gap_start = None
        if wc_gap_start is not None and len(wc_in_gap) - wc_gap_start >= WC_MIN_GAP:
            wc_gaps.append((wc_gap_start, len(wc_in_gap)))

        logger.info(f"ColumnGeometry: {len(wc_gaps)} word-coverage gaps found "
                    f"(min_width={WC_MIN_GAP}px): "
                    f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in wc_gaps]}")

        if len(wc_gaps) >= 2:
            validated_gaps = wc_gaps

    # --- Step 6: Fallback to clustering if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
        return _detect_columns_by_clustering(
            word_dicts, left_edges, edge_word_indices,
            content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
        )

    # --- Step 7: Derive column boundaries from gaps ---
    # Sort gaps by position
    validated_gaps.sort(key=lambda g: g[0])

    # Identify margin gaps (first and last) vs interior gaps.
    # A margin gap touches the edge of the content area (within 2% tolerance).
    edge_tolerance = max(10, int(content_w * 0.02))

    is_left_margin = validated_gaps[0][0] <= edge_tolerance
    is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance

    # Interior gaps define column boundaries:
    # a column starts at the end of a gap and ends at the start of the next gap.
    col_starts = []

    if is_left_margin:
        # First column starts after the left margin gap
        first_gap_end = validated_gaps[0][1]
        interior_gaps = validated_gaps[1:]
    else:
        # No left margin gap — first column starts at content left edge
        first_gap_end = 0
        interior_gaps = validated_gaps[:]

    if is_right_margin:
        # Last gap is right margin — don't use it as column start
        interior_gaps_for_boundaries = interior_gaps[:-1]
        right_boundary = validated_gaps[-1][0]  # last column ends at right margin gap start
    else:
        interior_gaps_for_boundaries = interior_gaps
        right_boundary = content_w

    # First column
    col_starts.append(left_x + first_gap_end)

    # Columns between interior gaps
    for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
        col_starts.append(left_x + gap_end_rel)

    # Count words per column region (for logging)
    col_start_counts = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            next_start = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            # The page margin contains only white space — extending the OCR
            # crop to the image edge is safe and prevents text near the right
            # border from being cut off.
            next_start = w

        col_left_rel = start_x - left_x
        col_right_rel = next_start - left_x
        n_words_in_col = sum(1 for wd in word_dicts
                             if col_left_rel <= wd['left'] < col_right_rel)
        col_start_counts.append((start_x, n_words_in_col))

    logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
                f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
                f"{col_start_counts}")

    # --- Step 8: Build ColumnGeometry objects ---
    # Determine right edge for each column
    all_boundaries = []
    for i, start_x in enumerate(col_starts):
        if i + 1 < len(col_starts):
            end_x = col_starts[i + 1]
        else:
            # Rightmost column always extends to full image width (w).
            end_x = w
        all_boundaries.append((start_x, end_x))

    geometries = []
    for i, (start_x, end_x) in enumerate(all_boundaries):
        col_width = end_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        col_words = [wd for wd in word_dicts
                     if col_left_rel <= wd['left'] < col_right_rel]

        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))

    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    # --- Step 9: Filter phantom narrow columns ---
    # Tiny spurious gaps (e.g. an 11px and a 35px gap right next to each
    # other) can create very narrow columns (< 3% of content width) with few
    # or no words. These are not real columns — remove them and close the
    # gap between neighbors.
    min_real_col_w = max(20, int(content_w * 0.03))
    filtered_geoms = [g for g in geometries
                      if not (g.word_count < 3 and g.width < min_real_col_w)]
    if len(filtered_geoms) < len(geometries):
        n_removed = len(geometries) - len(filtered_geoms)
        logger.info(f"ColumnGeometry: removed {n_removed} phantom column(s) "
                    f"(width < {min_real_col_w}px and words < 3)")
        # Extend each remaining column to close gaps with its right neighbor
        for i, g in enumerate(filtered_geoms):
            if i + 1 < len(filtered_geoms):
                g.width = filtered_geoms[i + 1].x - g.x
            else:
                g.width = w - g.x
            g.index = i
            col_left_rel = g.x - left_x
            col_right_rel = col_left_rel + g.width
            g.words = [wd for wd in word_dicts
                       if col_left_rel <= wd['left'] < col_right_rel]
            g.word_count = len(g.words)
        geometries = filtered_geoms
        logger.info(f"ColumnGeometry: {len(geometries)} columns after phantom filter: "
                    f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")

    return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
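
# A minimal usage sketch (assumed calling convention; `ocr_img` and
# `page_bgr` are stand-in names for images prepared earlier in the pipeline):
#
#   result = detect_column_geometry(ocr_img, page_bgr)
#   if result is not None:
#       geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = result
#       for g in geometries:
#           print(g.index, g.x, g.width, g.word_count)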
479
klausur-service/backend/cv_layout_detection.py
Normal file
@@ -0,0 +1,479 @@
"""
Document type detection, image preparation, content bounds, and header/footer detection.

Extracted from cv_layout.py — these are the "input-side" helpers that run before
column/row geometry analysis.

License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""

import logging
from typing import List, Optional, Tuple

import numpy as np

from cv_vocab_types import (
    DocumentTypeResult,
    PageRegion,
)

logger = logging.getLogger(__name__)

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]


# =============================================================================
# Document Type Detection
# =============================================================================

def detect_document_type(ocr_img: np.ndarray, img_bgr: np.ndarray) -> DocumentTypeResult:
    """Detect whether the page is a vocab table, generic table, or full text.

    Uses projection profiles and text density analysis — no OCR required.
    Runs in < 2 seconds.

    Args:
        ocr_img: Binarized grayscale image (for projection profiles).
        img_bgr: BGR color image.

    Returns:
        DocumentTypeResult with doc_type, confidence, pipeline, skip_steps.
    """
    if ocr_img is None or ocr_img.size == 0:
        return DocumentTypeResult(
            doc_type='full_text', confidence=0.5, pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features={'error': 'empty image'},
        )

    h, w = ocr_img.shape[:2]

    # --- 1. Vertical projection profile → detect column gaps ---
    # Sum dark pixels along each column (x-axis). Gaps = valleys in the profile.
    # Invert: dark pixels on white background → high values = text.
    vert_proj = np.sum(ocr_img < 128, axis=0).astype(float)

    # Smooth the profile to avoid noise spikes
    kernel_size = max(3, w // 100)
    if kernel_size % 2 == 0:
        kernel_size += 1
    vert_smooth = np.convolve(vert_proj, np.ones(kernel_size) / kernel_size, mode='same')

    # Find significant vertical gaps (columns of near-zero text density).
    # A gap must be at least 1% of image width and have < 5% of max density.
    max_density = max(vert_smooth.max(), 1)
    gap_threshold = max_density * 0.05
    min_gap_width = max(5, w // 100)

    in_gap = False
    gap_count = 0
    gap_start = 0
    vert_gaps = []

    for x in range(w):
        if vert_smooth[x] < gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = x
        else:
            if in_gap:
                gap_width = x - gap_start
                if gap_width >= min_gap_width:
                    gap_count += 1
                    vert_gaps.append((gap_start, x, gap_width))
                in_gap = False

    # Filter out margin gaps (within 10% of image edges)
    margin_threshold = w * 0.10
    internal_gaps = [g for g in vert_gaps
                     if g[0] > margin_threshold and g[1] < w - margin_threshold]
    internal_gap_count = len(internal_gaps)

    # --- 2. Horizontal projection profile → detect row gaps ---
    horiz_proj = np.sum(ocr_img < 128, axis=1).astype(float)
    h_kernel = max(3, h // 200)
    if h_kernel % 2 == 0:
        h_kernel += 1
    horiz_smooth = np.convolve(horiz_proj, np.ones(h_kernel) / h_kernel, mode='same')

    h_max = max(horiz_smooth.max(), 1)
    h_gap_threshold = h_max * 0.05
    min_row_gap = max(3, h // 200)

    row_gap_count = 0
    in_gap = False
    for y in range(h):
        if horiz_smooth[y] < h_gap_threshold:
            if not in_gap:
                in_gap = True
                gap_start = y
        else:
            if in_gap:
                if y - gap_start >= min_row_gap:
                    row_gap_count += 1
                in_gap = False

    # --- 3. Text density distribution (4×4 grid) ---
    grid_rows, grid_cols = 4, 4
    cell_h, cell_w = h // grid_rows, w // grid_cols
    densities = []
    for gr in range(grid_rows):
        for gc in range(grid_cols):
            cell = ocr_img[gr * cell_h:(gr + 1) * cell_h,
                           gc * cell_w:(gc + 1) * cell_w]
            if cell.size > 0:
                d = float(np.count_nonzero(cell < 128)) / cell.size
                densities.append(d)

    density_std = float(np.std(densities)) if densities else 0
    density_mean = float(np.mean(densities)) if densities else 0

    features = {
        'vertical_gaps': gap_count,
        'internal_vertical_gaps': internal_gap_count,
        'vertical_gap_details': [(g[0], g[1], g[2]) for g in vert_gaps[:10]],
        'row_gaps': row_gap_count,
        'density_mean': round(density_mean, 4),
        'density_std': round(density_std, 4),
        'image_size': (w, h),
    }

    # --- 4. Decision tree ---
    # Use internal_gap_count (excludes margin gaps) for column detection.
    if internal_gap_count >= 2 and row_gap_count >= 5:
        # Multiple internal vertical gaps + many row gaps → table
        confidence = min(0.95, 0.7 + internal_gap_count * 0.05 + row_gap_count * 0.005)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count >= 1 and row_gap_count >= 3:
        # Some internal structure, likely a table
        confidence = min(0.85, 0.5 + internal_gap_count * 0.1 + row_gap_count * 0.01)
        return DocumentTypeResult(
            doc_type='generic_table',
            confidence=round(confidence, 2),
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
    elif internal_gap_count == 0:
        # No internal column gaps → full text (regardless of density)
        confidence = min(0.95, 0.8 + (1 - min(density_std, 0.1)) * 0.15)
        return DocumentTypeResult(
            doc_type='full_text',
            confidence=round(confidence, 2),
            pipeline='full_page',
            skip_steps=['columns', 'rows'],
            features=features,
        )
    else:
        # Ambiguous — default to vocab_table (most common use case)
        return DocumentTypeResult(
            doc_type='vocab_table',
            confidence=0.5,
            pipeline='cell_first',
            skip_steps=[],
            features=features,
        )
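
# Worked example (hypothetical feature values): a scan with
# internal_gap_count = 3 and row_gap_count = 30 takes the first branch and
# gets confidence = min(0.95, 0.7 + 3 * 0.05 + 30 * 0.005) = 0.95 with the
# 'cell_first' pipeline, while a page with internal_gap_count = 0 is
# classified 'full_text' and skips column/row detection entirely.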


# =============================================================================
# Image Creation (Dual Image Preparation)
# =============================================================================

def create_ocr_image(img: np.ndarray) -> np.ndarray:
    """Create a binarized image optimized for Tesseract OCR.

    Steps: Grayscale → Background normalization → Adaptive threshold → Denoise.

    Args:
        img: BGR image.

    Returns:
        Binary image with dark text on a white background.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Background normalization: divide by blurred version
    bg = cv2.GaussianBlur(gray, (51, 51), 0)
    normalized = cv2.divide(gray, bg, scale=255)

    # Adaptive binarization
    binary = cv2.adaptiveThreshold(
        normalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )

    # Light denoise
    denoised = cv2.medianBlur(binary, 3)

    return denoised


def create_layout_image(img: np.ndarray) -> np.ndarray:
    """Create a CLAHE-enhanced grayscale image for layout analysis.

    Args:
        img: BGR image.

    Returns:
        Enhanced grayscale image.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    return enhanced
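
# Sketch of the intended dual-image preparation (`page_bgr` is a stand-in
# name for the dewarped BGR page):
#
#   ocr_img = create_ocr_image(page_bgr)        # binarized, for Tesseract
#   layout_img = create_layout_image(page_bgr)  # CLAHE grayscale, for layout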


# =============================================================================
# Content Bounds Detection
# =============================================================================

def _filter_narrow_runs(mask: np.ndarray, min_width: int) -> np.ndarray:
    """Remove contiguous True-runs shorter than *min_width* from a 1-D bool mask."""
    out = mask.copy()
    n = len(out)
    i = 0
    while i < n:
        if out[i]:
            start = i
            while i < n and out[i]:
                i += 1
            if (i - start) < min_width:
                out[start:i] = False
        else:
            i += 1
    return out
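
# Worked example: for mask = [T, T, F, T, T, T, T] and min_width = 3, the
# 2-wide run at indices 0..1 is cleared while the 4-wide run at 3..6 is kept,
# i.e. the result is [F, F, F, T, T, T, T].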


def _find_content_bounds(inv: np.ndarray) -> Tuple[int, int, int, int]:
    """Find the bounding box of actual text content (excluding page margins).

    Scan artefacts (thin black lines at page edges) are filtered out by
    discarding contiguous projection runs narrower than 1% of the image
    dimension (min 5px).

    Returns:
        Tuple of (left_x, right_x, top_y, bottom_y).
    """
    h, w = inv.shape[:2]
    threshold = 0.005

    # --- Horizontal projection for top/bottom ---
    h_proj = np.sum(inv, axis=1).astype(float) / (w * 255)
    h_mask = h_proj > threshold
    min_h_run = max(5, h // 100)
    h_mask = _filter_narrow_runs(h_mask, min_h_run)

    top_y = 0
    for y in range(h):
        if h_mask[y]:
            top_y = max(0, y - 5)
            break

    bottom_y = h
    for y in range(h - 1, 0, -1):
        if h_mask[y]:
            bottom_y = min(h, y + 5)
            break

    # --- Vertical projection for left/right margins ---
    v_proj = np.sum(inv[top_y:bottom_y, :], axis=0).astype(float)
    v_proj_norm = v_proj / ((bottom_y - top_y) * 255) if (bottom_y - top_y) > 0 else v_proj
    v_mask = v_proj_norm > threshold
    min_v_run = max(5, w // 100)
    v_mask = _filter_narrow_runs(v_mask, min_v_run)

    left_x = 0
    for x in range(w):
        if v_mask[x]:
            left_x = max(0, x - 2)
            break

    right_x = w
    for x in range(w - 1, 0, -1):
        if v_mask[x]:
            right_x = min(w, x + 2)
            break

    return left_x, right_x, top_y, bottom_y


# =============================================================================
# Header / Footer Detection
# =============================================================================

def _detect_header_footer_gaps(
        inv: np.ndarray,
        img_w: int,
        img_h: int,
) -> Tuple[Optional[int], Optional[int]]:
    """Detect header/footer boundaries via horizontal projection gap analysis.

    Scans the full-page inverted image for large horizontal gaps in the
    top/bottom 20% that separate header/footer content from the main body.

    Returns:
        (header_y, footer_y) — absolute y-coordinates.
        header_y = bottom edge of header region (None if no header detected).
        footer_y = top edge of footer region (None if no footer detected).
    """
    HEADER_FOOTER_ZONE = 0.20
    GAP_MULTIPLIER = 2.0

    # Step 1: Horizontal projection — clamp to img_h to avoid dewarp padding
    actual_h = min(inv.shape[0], img_h)
    roi = inv[:actual_h, :]
    h_proj = np.sum(roi, axis=1).astype(float)
    proj_w = roi.shape[1]
    h_proj_norm = h_proj / (proj_w * 255) if proj_w > 0 else h_proj

    # Step 2: Smoothing
    kernel_size = max(3, actual_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Step 3: Gap threshold
    positive = h_smooth[h_smooth > 0]
    median_density = float(np.median(positive)) if len(positive) > 0 else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, actual_h // 500)

    # Step 4: Collect contiguous gaps
    raw_gaps: List[Tuple[int, int]] = []
    gap_start: Optional[int] = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    if not raw_gaps:
        return None, None

    # Step 5: Compute median gap size and large-gap threshold
    gap_sizes = [g[1] - g[0] for g in raw_gaps]
    median_gap = float(np.median(gap_sizes))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    # Step 6: Find largest qualifying gap in header / footer zones.
    # A separator gap must have content on BOTH sides — edge-touching gaps
    # (e.g. dewarp padding at bottom) are not valid separators.
    EDGE_MARGIN = max(5, actual_h // 400)
    header_zone_limit = int(actual_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(actual_h * (1.0 - HEADER_FOOTER_ZONE))

    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    best_header_size = 0
    for gs, ge in raw_gaps:
        if gs <= EDGE_MARGIN:
            continue  # skip gaps touching the top edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if gap_size > best_header_size:
                best_header_size = gap_size
                header_y = ge  # bottom edge of gap

    best_footer_size = 0
    for gs, ge in raw_gaps:
        if ge >= actual_h - EDGE_MARGIN:
            continue  # skip gaps touching the bottom edge
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if gap_size > best_footer_size:
                best_footer_size = gap_size
                footer_y = gs  # top edge of gap

    if header_y is not None:
        logger.info(f"HeaderFooterGaps: header boundary at y={header_y} "
                    f"(gap={best_header_size}px, median_gap={median_gap:.0f}px)")
    if footer_y is not None:
        logger.info(f"HeaderFooterGaps: footer boundary at y={footer_y} "
                    f"(gap={best_footer_size}px, median_gap={median_gap:.0f}px)")

    return header_y, footer_y
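
# Worked example (hypothetical values): on a 3000px-tall page the header zone
# ends at y = 600. If the detected gaps have a median height of 12px, a 30px
# gap centered at y = 450 exceeds 12 * GAP_MULTIPLIER = 24 and does not touch
# the top edge, so header_y becomes that gap's bottom edge; a gap starting at
# y = 0 is skipped as edge/dewarp padding regardless of its size.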


def _region_has_content(inv: np.ndarray, y_start: int, y_end: int,
                        min_density: float = 0.005) -> bool:
    """Check whether a horizontal strip contains meaningful ink.

    Args:
        inv: Inverted binarized image (white-on-black).
        y_start: Top of the region (inclusive).
        y_end: Bottom of the region (exclusive).
        min_density: Fraction of white pixels required to count as content.

    Returns:
        True if the region contains text/graphics, False if empty margin.
    """
    if y_start >= y_end:
        return False
    strip = inv[y_start:y_end, :]
    density = float(np.sum(strip)) / (strip.shape[0] * strip.shape[1] * 255)
    return density > min_density


def _add_header_footer(regions: List[PageRegion], top_y: int, bottom_y: int,
                       img_w: int, img_h: int,
                       inv: Optional[np.ndarray] = None) -> None:
    """Add header/footer/margin regions in-place.

    Uses gap-based detection when *inv* is provided, otherwise falls back
    to simple top_y/bottom_y bounds.

    Region types depend on whether there is actual content (text/graphics):
    - 'header' / 'footer' — region contains text (e.g. title, page number)
    - 'margin_top' / 'margin_bottom' — region is empty page margin
    """
    header_y: Optional[int] = None
    footer_y: Optional[int] = None

    if inv is not None:
        header_y, footer_y = _detect_header_footer_gaps(inv, img_w, img_h)

    # --- Top region ---
    top_boundary = header_y if header_y is not None and header_y > 10 else (
        top_y if top_y > 10 else None
    )
    if top_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, 0, top_boundary)
        rtype = 'header' if has_content else 'margin_top'
        regions.append(PageRegion(type=rtype, x=0, y=0, width=img_w, height=top_boundary))
        logger.info(f"HeaderFooter: top region type={rtype} height={top_boundary}px "
                    f"(has_content={has_content})")

    # --- Bottom region ---
    bottom_boundary = footer_y if footer_y is not None and footer_y < img_h - 10 else (
        bottom_y if bottom_y < img_h - 10 else None
    )
    if bottom_boundary is not None:
        has_content = inv is not None and _region_has_content(inv, bottom_boundary, img_h)
        rtype = 'footer' if has_content else 'margin_bottom'
        regions.append(PageRegion(type=rtype, x=0, y=bottom_boundary, width=img_w,
                                  height=img_h - bottom_boundary))
        logger.info(f"HeaderFooter: bottom region type={rtype} y={bottom_boundary} "
                    f"height={img_h - bottom_boundary}px (has_content={has_content})")
329
klausur-service/backend/cv_layout_row_regularize.py
Normal file
@@ -0,0 +1,329 @@
"""
Row grid regularization for document layout analysis.

Provides word-center-based row boundary refinement to improve
gap-based row detection. Extracted from cv_layout_rows.py.

License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""

import logging
from typing import Dict, List

import numpy as np

from cv_vocab_types import RowGeometry

logger = logging.getLogger(__name__)


def _regularize_row_grid(
        rows: List['RowGeometry'],
        word_dicts: List[Dict],
        left_x: int, right_x: int,
        top_y: int,
        content_w: int, content_h: int,
        inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A:  Group all content words into line clusters by Y-proximity.
             Tolerance = 40% of median gap-based row height.
    Step B:  For each cluster compute:
             - center_y = median of (word_top + word_height/2) for all words
             - letter_h = median of word heights (excluding outliers > 2× median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
             (spurious splits from OCR jitter).
    Step C:  Compute pitches (distances between consecutive centers).
             Detect section breaks where gap > 1.8× median pitch.
    Step D:  Split clusters into sections at the section breaks.
    Step E:  Within each section, place row boundaries at midpoints between
             consecutive line centers:
             - First row top = center - local_pitch/2
             - Last row bottom = center + local_pitch/2
             - Interior boundaries = (center_i + center_{i+1}) / 2
             This ensures rows tile seamlessly without gaps or overlaps.
    Step F:  Re-assign words to the nearest grid row by vertical center distance.
    Step G:  Validate that >= 85% of words land in a grid row; otherwise
             fall back to the original gap-based rows.
    Step H:  Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated)
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Compute median word height (excluding outliers like tall brackets/IPA)
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as the
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in the
    # cluster; letter_h = median of word heights, excluding outlier-height
    # words (> 2× median) so that tall brackets/IPA don't skew the height.
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,  # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows
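
    # Worked example (hypothetical values): with median_row_h = 42px the merge
    # threshold is max(8, 42 * 0.3) = 12.6px, so two clusters whose centers
    # sit at rel-y 512.0 and 520.0 (8px apart) are merged back into a single
    # text line and their center is recomputed over the combined words.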

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading)
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to reasonable bounds
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, "
                f"median_wh={median_wh}px, y_tol={y_tol}px, "
                f"{len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
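
# A minimal sketch of where this fits (assumed calling convention;
# detect_row_geometry() in cv_layout_rows.py applies this as its Phase 2):
#
#   rows = _regularize_row_grid(gap_rows, word_dicts,
#                               left_x, right_x, top_y,
#                               content_w, content_h, inv)
#
# If any guard fails (too few rows/clusters, < 85% of words matched), the
# original gap-based rows are returned unchanged.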
352
klausur-service/backend/cv_layout_rows.py
Normal file
@@ -0,0 +1,352 @@
"""
Row geometry detection for document layout analysis.

Provides horizontal whitespace-gap analysis to detect text rows,
word-center grid regularization, and fallback word-grouping.

Extracted from cv_layout.py.

License: Apache 2.0 (commercial use permitted)
DATA PRIVACY: All processing happens locally.
"""

import logging
from typing import Dict, List

import numpy as np

try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]

from cv_vocab_types import RowGeometry
from cv_ocr_word_assembly import _group_words_into_lines
from cv_layout_row_regularize import _regularize_row_grid

logger = logging.getLogger(__name__)


# =============================================================================
# Row Geometry Detection (horizontal whitespace-gap analysis)
# =============================================================================

def detect_row_geometry(
        inv: np.ndarray,
        word_dicts: List[Dict],
        left_x: int, right_x: int,
        top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect row geometry using horizontal whitespace-gap analysis.

    Algorithm overview (two phases):

    Phase 1 — Gap-based detection (Steps 1–6):
      1. Build a horizontal projection profile: for each y-pixel, sum the
         ink density across the content width. Only pixels within/near
         Tesseract word bounding boxes contribute (word_mask), so that
         images/illustrations don't merge adjacent text rows.
      2. Smooth the projection and find contiguous regions below a
         threshold (= gaps / horizontal whitespace between text lines).
         The threshold is 15% of the median non-zero density.
      3. Validate gaps against word bounding boxes — discard any gap
         that overlaps a word, or shift the gap boundary to avoid the word.
      4. Build rows from the spans between validated gaps.
      5. Detect header/footer rows: gaps in the top/bottom 15% of the
         page that are >= 2× the median gap size mark section boundaries.

    Phase 2 — Word-center regularization (_regularize_row_grid, Step 7):
      For each word, compute its vertical center (top + height/2).
      Group words into line clusters by Y-proximity (tolerance = 40% of
      the median gap-based row height).
      For each cluster, the line center = median of all word centers.
      The "pitch" = distance between consecutive line centers.
      Section breaks are detected where the pitch exceeds 1.8× the median.
      Within each section, row boundaries are placed at the midpoints
      between consecutive line centers:
      - Row top = midpoint to previous line center (or center - pitch/2 for first)
      - Row bottom = midpoint to next line center (or center + pitch/2 for last)
      This ensures rows tile without gaps or overlaps.

    Fallback:
      If < 2 gaps are found (very dense or uniform text), falls back to
      _build_rows_from_word_grouping() which groups words by Y proximity.

    Args:
        inv: Inverted binarized image (white text on black bg, full page).
        word_dicts: Word bounding boxes from Tesseract (relative to content ROI).
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        List of RowGeometry objects sorted top to bottom.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    # --- Step 1: Horizontal projection profile ---
    # For each y-pixel row, sum ink density across the content width.
    # A word-coverage mask ensures only pixels near Tesseract words contribute,
    # so that illustrations/images don't inflate the density and merge rows.
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    WORD_PAD_Y = max(4, content_h // 300)  # small vertical padding around words
    word_mask = np.zeros((content_h, content_w), dtype=np.uint8)
    for wd in word_dicts:
        y1 = max(0, wd['top'] - WORD_PAD_Y)
        y2 = min(content_h, wd['top'] + wd['height'] + WORD_PAD_Y)
        x1 = max(0, wd['left'])
        x2 = min(content_w, wd['left'] + wd['width'])
        word_mask[y1:y2, x1:x2] = 255

    masked_strip = cv2.bitwise_and(content_strip, word_mask)
    h_proj = np.sum(masked_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255) if content_w > 0 else h_proj
|
||||
|
||||
# --- Step 2: Smoothing + gap threshold ---
|
||||
# Smooth the projection to reduce noise, then threshold at 15% of the
|
||||
# median non-zero density. Pixels below this threshold are considered
|
||||
# "gap" (horizontal whitespace between text lines).
|
||||
# MIN_GAP_HEIGHT prevents tiny noise gaps from splitting rows.
|
||||
kernel_size = max(3, content_h // 200)
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size += 1
|
||||
h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
|
||||
|
||||
median_density = float(np.median(h_smooth[h_smooth > 0])) if np.any(h_smooth > 0) else 0.01
|
||||
gap_threshold = max(median_density * 0.15, 0.003)
|
||||
|
||||
in_gap = h_smooth < gap_threshold
|
||||
MIN_GAP_HEIGHT = max(3, content_h // 500)
|
||||
|
||||
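    # Worked example (illustrative numbers, not from the commit): for a
    # 3000 px tall content area with median non-zero density 0.20:
    #   gap_threshold  = max(0.20 * 0.15, 0.003) = 0.03  (3% ink coverage)
    #   MIN_GAP_HEIGHT = max(3, 3000 // 500)      = 6 px
    # so any smoothed run of >= 6 scanlines below 3% ink counts as a gap.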
    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = []  # (start_y_rel, end_y_rel) relative to content ROI
    gap_start = None
    for y in range(len(in_gap)):
        if in_gap[y]:
            if gap_start is None:
                gap_start = y
        else:
            if gap_start is not None:
                gap_height = y - gap_start
                if gap_height >= MIN_GAP_HEIGHT:
                    raw_gaps.append((gap_start, y))
                gap_start = None
    if gap_start is not None:
        gap_height = len(in_gap) - gap_start
        if gap_height >= MIN_GAP_HEIGHT:
            raw_gaps.append((gap_start, len(in_gap)))

    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    # A gap is valid only if no word's bounding box overlaps it vertically.
    # If a word overlaps, try to shift the gap boundary above or below the
    # word. If neither shift yields enough room (>= MIN_GAP_HEIGHT), discard.
    validated_gaps = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        overlapping = False
        for wd in word_dicts:
            word_top = wd['top']
            word_bottom = wd['top'] + wd['height']
            if word_top < gap_end_rel and word_bottom > gap_start_rel:
                overlapping = True
                break

        if not overlapping:
            validated_gaps.append((gap_start_rel, gap_end_rel))
        else:
            # Try to shift the gap to avoid overlapping words
            min_word_top = content_h
            max_word_bottom = 0
            for wd in word_dicts:
                word_top = wd['top']
                word_bottom = wd['top'] + wd['height']
                if word_top < gap_end_rel and word_bottom > gap_start_rel:
                    min_word_top = min(min_word_top, word_top)
                    max_word_bottom = max(max_word_bottom, word_bottom)

            if min_word_top - gap_start_rel >= MIN_GAP_HEIGHT:
                validated_gaps.append((gap_start_rel, min_word_top))
            elif gap_end_rel - max_word_bottom >= MIN_GAP_HEIGHT:
                validated_gaps.append((max_word_bottom, gap_end_rel))
            else:
                logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                             f"discarded (word overlap, no room to shift)")

    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    gap_sizes = [g[1] - g[0] for g in validated_gaps]
    median_gap = float(np.median(gap_sizes)) if gap_sizes else 0
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_boundary_rel = None  # y below which is header
    footer_boundary_rel = None  # y above which is footer

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    # Find largest gap in header zone
    best_header_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid < header_zone_limit and gap_size > large_gap_threshold:
            if best_header_gap is None or gap_size > (best_header_gap[1] - best_header_gap[0]):
                best_header_gap = (gs, ge)

    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    # Find largest gap in footer zone
    best_footer_gap = None
    for gs, ge in validated_gaps:
        gap_mid = (gs + ge) / 2
        gap_size = ge - gs
        if gap_mid > footer_zone_start and gap_size > large_gap_threshold:
            if best_footer_gap is None or gap_size > (best_footer_gap[1] - best_footer_gap[0]):
                best_footer_gap = (gs, ge)

    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")
    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between consecutive gaps: each row extends from
    # the end of the previous gap to the start of the next gap, so the
    # validated gap edges (not their midpoints) define the row boundaries.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    # Top of content to first gap
    if validated_gaps[0][0] > MIN_GAP_HEIGHT:
        row_boundaries.append((0, validated_gaps[0][0]))

    # Between gaps
    for i in range(len(validated_gaps) - 1):
        row_start = validated_gaps[i][1]
        row_end = validated_gaps[i + 1][0]
        if row_end - row_start > 0:
            row_boundaries.append((row_start, row_end))

    # Last gap to bottom of content
    if validated_gaps[-1][1] < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((validated_gaps[-1][1], content_h))

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        # Determine row type
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # Collect words in this row
        row_words = [w for w in word_dicts
                     if w['top'] + w['height'] / 2 >= row_start_rel
                     and w['top'] + w['height'] / 2 < row_end_rel]

        # Gap before this row
        gap_before = 0
        if idx == 0 and validated_gaps[0][0] > 0:
            gap_before = validated_gaps[0][0]
        elif idx > 0:
            # Find the gap just before this row boundary
            for gs, ge in validated_gaps:
                if ge == row_start_rel:
                    gap_before = ge - gs
                    break

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_before,
        ))

    # --- Step 7: Word-center grid regularization ---
    # Refine the gap-based rows using word vertical centers. For each word,
    # compute center_y = top + height/2. Group into line clusters, compute
    # the pitch (distance between consecutive line centers), and place row
    # boundaries at the midpoints between centers. This gives more precise
    # and evenly-spaced rows than the gap-based approach alone.
    # Also detects section breaks (headings, paragraphs) where the pitch
    # exceeds 1.8× the median, and handles each section independently.
    rows = _regularize_row_grid(rows, word_dicts, left_x, right_x, top_y,
                                content_w, content_h, inv)

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows

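# Minimal usage sketch (illustrative only — the file name and coordinates
# are assumptions, not part of the commit). `inv` must be the inverted
# binarised page; `word_dicts` are Tesseract boxes relative to the content ROI:
#
#   import cv2
#   page = cv2.imread("scan.png", cv2.IMREAD_GRAYSCALE)
#   _, bw = cv2.threshold(page, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
#   inv = cv2.bitwise_not(bw)
#   rows = detect_row_geometry(inv, word_dicts, left_x=50, right_x=2400,
#                              top_y=80, bottom_y=3200)
#   content_rows = [r for r in rows if r.row_type == 'content']
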
def _build_rows_from_word_grouping(
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
    content_w: int, content_h: int,
) -> List['RowGeometry']:
    """Fallback: build rows by grouping words by Y position.

    Uses _group_words_into_lines() with a generous tolerance.
    No header/footer detection in fallback mode.
    """
    if not word_dicts:
        return []

    y_tolerance = max(20, content_h // 100)
    lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)

    rows = []
    for idx, line_words in enumerate(lines):
        if not line_words:
            continue
        min_top = min(w['top'] for w in line_words)
        max_bottom = max(w['top'] + w['height'] for w in line_words)
        row_height = max_bottom - min_top

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + min_top,
            width=content_w,
            height=row_height,
            word_count=len(line_words),
            words=line_words,
            row_type='content',
            gap_before=0,
        ))

    logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
    return rows
441
klausur-service/backend/cv_layout_scoring.py
Normal file
@@ -0,0 +1,441 @@
"""
|
||||
Language scoring, role scoring, and dictionary detection/classification.
|
||||
|
||||
Extracted from cv_layout.py to keep modules under 500 LOC.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
ColumnGeometry,
|
||||
ENGLISH_FUNCTION_WORDS,
|
||||
GERMAN_FUNCTION_WORDS,
|
||||
PageRegion,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Dictionary / Wörterbuch Detection ---
|
||||
|
||||
# Article words that appear as a dedicated column in dictionaries
|
||||
_DICT_ARTICLE_WORDS = {
|
||||
# German articles
|
||||
"die", "der", "das", "dem", "den", "des", "ein", "eine", "einem", "einer",
|
||||
# English articles / infinitive marker
|
||||
"the", "a", "an", "to",
|
||||
}
|
||||
|
||||
|
||||
# --- Phase B: Content-Based Classification ---
|
||||
|
||||
def _score_language(words: List[Dict]) -> Dict[str, float]:
|
||||
"""Score the language of a column's words.
|
||||
|
||||
Analyzes function words, umlauts, and capitalization patterns
|
||||
to determine whether text is English or German.
|
||||
|
||||
Args:
|
||||
words: List of word dicts with 'text' and 'conf' keys.
|
||||
|
||||
Returns:
|
||||
Dict with 'eng' and 'deu' scores (0.0-1.0).
|
||||
"""
|
||||
if not words:
|
||||
return {'eng': 0.0, 'deu': 0.0}
|
||||
|
||||
# Only consider words with decent confidence
|
||||
good_words = [w['text'].lower() for w in words if w.get('conf', 0) > 40 and len(w['text']) > 0]
|
||||
if not good_words:
|
||||
return {'eng': 0.0, 'deu': 0.0}
|
||||
|
||||
total = len(good_words)
|
||||
en_hits = sum(1 for w in good_words if w in ENGLISH_FUNCTION_WORDS)
|
||||
de_hits = sum(1 for w in good_words if w in GERMAN_FUNCTION_WORDS)
|
||||
|
||||
# Check for umlauts (strong German signal)
|
||||
raw_texts = [w['text'] for w in words if w.get('conf', 0) > 40]
|
||||
umlaut_count = sum(1 for t in raw_texts
|
||||
for c in t if c in 'äöüÄÖÜß')
|
||||
|
||||
    # German capitalization: nouns are capitalized mid-sentence.
    # Heuristic count of capitalized words (length check first, so empty
    # tokens can never trigger an index error on t[0]).
    cap_words = sum(1 for t in raw_texts if len(t) > 2 and t[0].isupper())

    en_score = en_hits / total if total > 0 else 0.0
    de_score = de_hits / total if total > 0 else 0.0

    # Boost German score for umlauts
    if umlaut_count > 0:
        de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))

    # Boost German score for high capitalization ratio (typical for German nouns)
    if total > 5:
        cap_ratio = cap_words / total
        if cap_ratio > 0.3:
            de_score = min(1.0, de_score + 0.1)

    return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}

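# Illustrative example (assumption, not verified against the commit:
# 'der', 'und' and 'die' are taken to be in GERMAN_FUNCTION_WORDS).
# With 3/6 function-word hits (0.50), one umlaut (+0.15) and a
# capitalization ratio of 2/6 > 0.3 (+0.1), the German score lands
# around 0.75:
#
#   words = [{'text': t, 'conf': 90}
#            for t in ('der', 'Hund', 'und', 'die', 'Katze', 'schön')]
#   _score_language(words)   # -> approx. {'eng': 0.0, 'deu': 0.75}
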
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Score the role of a column based on its geometry and content patterns.

    Args:
        geom: ColumnGeometry with words and dimensions.

    Returns:
        Dict with role scores: 'reference', 'marker', 'sentence', 'vocabulary'.
    """
    scores = {'reference': 0.0, 'marker': 0.0, 'sentence': 0.0, 'vocabulary': 0.0}

    if not geom.words:
        return scores

    texts = [w['text'] for w in geom.words if w.get('conf', 0) > 40]
    if not texts:
        return scores

    avg_word_len = sum(len(t) for t in texts) / len(texts)
    has_punctuation = sum(1 for t in texts if any(c in t for c in '.!?;:,'))
    digit_words = sum(1 for t in texts if any(c.isdigit() for c in t))
    digit_ratio = digit_words / len(texts) if texts else 0.0

    # Reference: narrow + mostly numbers/page references
    if geom.width_ratio < 0.12:
        scores['reference'] = 0.5
        if digit_ratio > 0.4:
            scores['reference'] = min(1.0, 0.5 + digit_ratio * 0.5)

    # Marker: narrow + few short entries
    if geom.width_ratio < 0.06 and geom.word_count <= 15:
        scores['marker'] = 0.7
        if avg_word_len < 4:
            scores['marker'] = 0.9
    # Very narrow non-edge column → strong marker regardless of word count
    if geom.width_ratio < 0.04 and geom.index > 0:
        scores['marker'] = max(scores['marker'], 0.9)

    # Sentence: longer words + punctuation present
    if geom.width_ratio > 0.15 and has_punctuation > 2:
        scores['sentence'] = 0.3 + min(0.5, has_punctuation / len(texts))
        if avg_word_len > 4:
            scores['sentence'] = min(1.0, scores['sentence'] + 0.2)

    # Vocabulary: medium width + medium word length
    if 0.10 < geom.width_ratio < 0.45:
        scores['vocabulary'] = 0.4
        if 3 < avg_word_len < 8:
            scores['vocabulary'] = min(1.0, scores['vocabulary'] + 0.3)

    return {k: round(v, 3) for k, v in scores.items()}


def _score_dictionary_signals(
    geometries: List[ColumnGeometry],
    document_category: Optional[str] = None,
    margin_strip_detected: bool = False,
) -> Dict[str, Any]:
    """Score dictionary-specific patterns across all columns.

    Combines 4 independent signals to determine if the page is a dictionary:
    1. Alphabetical ordering of words in each column
    2. Article column detection (der/die/das, to)
    3. First-letter uniformity (most headwords share a letter)
    4. Decorative A-Z margin strip (detected upstream)

    Args:
        geometries: List of ColumnGeometry with words.
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        Dict with 'is_dictionary', 'confidence', 'article_col_index',
        'headword_col_index', and 'signals' sub-dict.
    """
    result: Dict[str, Any] = {
        "is_dictionary": False,
        "confidence": 0.0,
        "article_col_index": None,
        "headword_col_index": None,
        "signals": {},
    }

    if not geometries or len(geometries) < 2:
        return result

    # --- Signal 1: Alphabetical ordering per column (weight 0.35) ---
    best_alpha_score = 0.0
    best_alpha_col = -1
    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
        ]
        if len(texts) < 5:
            continue
        # Deduplicate consecutive identical words (OCR double-reads)
        deduped = [texts[0]]
        for t in texts[1:]:
            if t != deduped[-1]:
                deduped.append(t)
        if len(deduped) < 5:
            continue
        # Count consecutive pairs in alphabetical order
        ordered_pairs = sum(
            1 for i in range(len(deduped) - 1)
            if deduped[i] <= deduped[i + 1]
        )
        alpha_score = ordered_pairs / (len(deduped) - 1)
        if alpha_score > best_alpha_score:
            best_alpha_score = alpha_score
            best_alpha_col = geom.index

    result["signals"]["alphabetical_score"] = round(best_alpha_score, 3)
    result["signals"]["alphabetical_col"] = best_alpha_col

    # --- Signal 2: Article detection (weight 0.25) ---
    # Check three patterns:
    # (a) Dedicated narrow article column (der/die/das only)
    # (b) Inline articles: multi-word texts starting with "der X", "die X"
    # (c) High article word frequency: many individual words ARE articles
    #     (common when OCR splits "der Zustand" into separate word_boxes)
    best_article_density = 0.0
    best_article_col = -1
    best_inline_article_ratio = 0.0
    best_article_word_ratio = 0.0

    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in geom.words
            if w.get("conf", 0) > 30 and len(w["text"].strip()) > 0
        ]
        if len(texts) < 3:
            continue

        # (a) Dedicated article column: narrow, mostly article words
        article_count = sum(1 for t in texts if t in _DICT_ARTICLE_WORDS)
        if geom.width_ratio <= 0.20:
            density = article_count / len(texts)
            if density > best_article_density:
                best_article_density = density
                best_article_col = geom.index

        # (b) Inline articles: "der Zustand", "die Zutat", etc.
        inline_count = sum(
            1 for t in texts
            if any(t.startswith(art + " ") for art in _DICT_ARTICLE_WORDS)
        )
        inline_ratio = inline_count / len(texts)
        if inline_ratio > best_inline_article_ratio:
            best_inline_article_ratio = inline_ratio

        # (c) Article word frequency in any column (for OCR-split word_boxes)
        # In dictionaries, articles appear frequently among headwords
        # Require at least 10% articles and >= 3 article words
        if article_count >= 3:
            art_ratio = article_count / len(texts)
            # Only count if column has enough non-article words too
            # (pure article column is handled by (a))
            non_art = len(texts) - article_count
            if non_art >= 3 and art_ratio > best_article_word_ratio:
                best_article_word_ratio = art_ratio

    # Use the strongest signal
    effective_article_score = max(
        best_article_density,
        best_inline_article_ratio,
        best_article_word_ratio * 0.8,  # slight discount for raw word ratio
    )

    result["signals"]["article_density"] = round(best_article_density, 3)
    result["signals"]["inline_article_ratio"] = round(best_inline_article_ratio, 3)
    result["signals"]["article_word_ratio"] = round(best_article_word_ratio, 3)
    result["signals"]["article_col"] = best_article_col

    # --- Signal 3: First-letter uniformity (weight 0.25) ---
    best_uniformity = 0.0
    best_uniform_col = -1
    has_letter_transition = False
    for geom in geometries:
        texts = [
            w["text"].strip().lower()
            for w in sorted(geom.words, key=lambda w: w.get("top", 0))
            if w.get("conf", 0) > 30 and len(w["text"].strip()) >= 2
        ]
        if len(texts) < 5:
            continue
        # Count first letters
        first_letters = [t[0] for t in texts if t[0].isalpha()]
        if not first_letters:
            continue
        letter_counts = Counter(first_letters)
        most_common_letter, most_common_count = letter_counts.most_common(1)[0]
        uniformity = most_common_count / len(first_letters)

        # Check for orderly letter transitions (A→B or Y→Z)
        # Group consecutive words by first letter, check if groups are in order
        groups = []
        current_letter = first_letters[0]
        for fl in first_letters:
            if fl != current_letter:
                groups.append(current_letter)
                current_letter = fl
        groups.append(current_letter)
        if len(groups) >= 2 and len(groups) <= 5:
            # Check if groups are alphabetically ordered
            if all(groups[i] <= groups[i + 1] for i in range(len(groups) - 1)):
                has_letter_transition = True
                # Boost uniformity for orderly transitions
                uniformity = max(uniformity, 0.70)

        if uniformity > best_uniformity:
            best_uniformity = uniformity
            best_uniform_col = geom.index

    result["signals"]["first_letter_uniformity"] = round(best_uniformity, 3)
    result["signals"]["uniform_col"] = best_uniform_col
    result["signals"]["has_letter_transition"] = has_letter_transition

    # --- Signal 4: Decorative margin strip (weight 0.15) ---
    result["signals"]["margin_strip_detected"] = margin_strip_detected

    # --- Combine signals ---
    s1 = min(best_alpha_score, 1.0) * 0.35
    s2 = min(effective_article_score, 1.0) * 0.25
    s3 = min(best_uniformity, 1.0) * 0.25
    s4 = (1.0 if margin_strip_detected else 0.0) * 0.15

    combined = s1 + s2 + s3 + s4

    # Boost if user set document_category to 'woerterbuch'
    if document_category == "woerterbuch":
        combined = min(1.0, combined + 0.20)
        result["signals"]["category_boost"] = True

    result["confidence"] = round(combined, 3)

    # Threshold: combined >= 0.40 to classify as dictionary
    # (at least 2 strong signals or 3 moderate ones)
    if combined >= 0.40:
        result["is_dictionary"] = True
        # Identify headword column: best alphabetical OR best uniform
        if best_alpha_col >= 0 and best_alpha_score >= 0.60:
            result["headword_col_index"] = best_alpha_col
        elif best_uniform_col >= 0 and best_uniformity >= 0.50:
            result["headword_col_index"] = best_uniform_col
        if best_article_col >= 0 and best_article_density >= 0.30:
            result["article_col_index"] = best_article_col
        # If inline articles are strong but no dedicated column, note it
        if best_inline_article_ratio >= 0.30 and result["article_col_index"] is None:
            result["signals"]["inline_articles_detected"] = True

    logger.info(
        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
        combined, result["is_dictionary"], result["signals"],
    )

    return result

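# Worked example of the weighted combination (illustrative numbers):
# alphabetical 0.90, article signal 0.60, uniformity 0.80, no margin strip:
#   combined = 0.90*0.35 + 0.60*0.25 + 0.80*0.25 + 0.0*0.15
#            = 0.315 + 0.150 + 0.200 + 0.0 = 0.665 >= 0.40 -> dictionary.
# With document_category == "woerterbuch" this would rise to
# min(1.0, 0.665 + 0.20) = 0.865.
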
def _classify_dictionary_columns(
    geometries: List[ColumnGeometry],
    dict_signals: Dict[str, Any],
    lang_scores: List[Dict[str, float]],
    content_h: int,
) -> Optional[List[PageRegion]]:
    """Classify columns for a detected dictionary page.

    Assigns column_headword, column_article, column_ipa, and
    column_de/column_en based on dictionary signals and language scores.

    Returns None if classification fails.
    """
    if not dict_signals.get("is_dictionary"):
        return None

    regions: List[PageRegion] = []
    assigned = set()
    article_idx = dict_signals.get("article_col_index")
    headword_idx = dict_signals.get("headword_col_index")

    # 1. Assign article column if detected
    if article_idx is not None:
        for geom in geometries:
            if geom.index == article_idx:
                regions.append(PageRegion(
                    type="column_article",
                    x=geom.x, y=geom.y,
                    width=geom.width, height=content_h,
                    classification_confidence=round(
                        dict_signals["signals"].get("article_density", 0.5), 2),
                    classification_method="dictionary",
                ))
                assigned.add(geom.index)
                break

    # 2. Assign headword column
    if headword_idx is not None and headword_idx not in assigned:
        for geom in geometries:
            if geom.index == headword_idx:
                regions.append(PageRegion(
                    type="column_headword",
                    x=geom.x, y=geom.y,
                    width=geom.width, height=content_h,
                    classification_confidence=round(
                        dict_signals["confidence"], 2),
                    classification_method="dictionary",
                ))
                assigned.add(geom.index)
                break

    # 3. Assign remaining columns by language + content
    remaining = [g for g in geometries if g.index not in assigned]
    for geom in remaining:
        ls = lang_scores[geom.index] if geom.index < len(lang_scores) else {"eng": 0, "deu": 0}

        # Check if column contains IPA (brackets like [, /, ˈ)
        ipa_chars = sum(
            1 for w in geom.words
            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
        )
        ipa_ratio = ipa_chars / max(len(geom.words), 1)

        if ipa_ratio > 0.25:
            col_type = "column_ipa"
            conf = round(min(1.0, ipa_ratio), 2)
        elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
            col_type = "column_de"
            conf = round(ls["deu"], 2)
        elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
            col_type = "column_en"
            conf = round(ls["eng"], 2)
        else:
            # Positional fallback: leftmost unassigned = EN, next = DE
            left_unassigned = sorted(
                [g for g in remaining if g.index not in assigned],
                key=lambda g: g.x,
            )
            if left_unassigned and geom == left_unassigned[0]:
                col_type = "column_en"
            else:
                col_type = "column_de"
            conf = 0.4

        regions.append(PageRegion(
            type=col_type,
            x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=conf,
            classification_method="dictionary",
        ))
        assigned.add(geom.index)

    regions.sort(key=lambda r: r.x)
    return regions
493
klausur-service/backend/cv_ocr_cell_filter.py
Normal file
@@ -0,0 +1,493 @@
"""
|
||||
Cell text filtering, column/row word assignment, and bold detection.
|
||||
|
||||
This module contains:
|
||||
- _assign_row_words_to_columns(): spatial assignment of OCR words to grid columns
|
||||
- Cell text noise filtering (_clean_cell_text, _clean_cell_text_lite, etc.)
|
||||
- Bold detection via stroke-width analysis (_measure_stroke_width, _classify_bold_cells)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion, RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except ImportError:
|
||||
cv2 = None # type: ignore[assignment]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Column / Row word assignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _assign_row_words_to_columns(
    row: RowGeometry,
    columns: List[PageRegion],
) -> Dict[int, List[Dict]]:
    """Assign each word in a row to exactly one column.

    Uses a layered strategy (matching the implementation below):
    1. Overlap: assign each word to the column its bounding box overlaps most.
    2. Center range: for words with no horizontal overlap, fall back to
       midpoint-based assignment ranges between adjacent columns.
    3. Nearest center: as a last resort, pick the column whose center is
       closest to the word's center.

    This prevents long sentences in wide columns (e.g. example) from having
    their rightmost words stolen by an adjacent column.

    Args:
        row: Row with words (relative coordinates).
        columns: Sorted list of columns (absolute coordinates).

    Returns:
        Dict mapping col_index -> list of words assigned to that column.
    """
    result: Dict[int, List[Dict]] = {i: [] for i in range(len(columns))}

    if not row.words or not columns:
        return result

    left_x = row.x  # content ROI left (absolute)

    # Build non-overlapping column assignment ranges using midpoints.
    # For adjacent columns, the boundary is the midpoint between them.
    # This prevents words near column borders from being assigned to
    # the wrong column (e.g. "We" at the start of an example sentence
    # being stolen by the preceding DE column).
    n = len(columns)
    col_ranges_rel = []  # (assign_left, assign_right) per column
    for ci, col in enumerate(columns):
        col_left_rel = col.x - left_x
        col_right_rel = col_left_rel + col.width

        # Left boundary: midpoint to previous column, or 0
        if ci == 0:
            assign_left = 0
        else:
            prev_right = columns[ci - 1].x - left_x + columns[ci - 1].width
            assign_left = (prev_right + col_left_rel) / 2

        # Right boundary: midpoint to next column, or infinity (row width)
        if ci == n - 1:
            assign_right = row.width + 100  # generous for last column
        else:
            next_left = columns[ci + 1].x - left_x
            assign_right = (col_right_rel + next_left) / 2

        col_ranges_rel.append((assign_left, assign_right))

    for w in row.words:
        w_left = w['left']
        w_right = w_left + w['width']
        w_center_x = w_left + w['width'] / 2

        # Primary: overlap-based matching — assign to column with most overlap.
        # This is more robust than center-based for narrow columns (page_ref)
        # where the last character's center may fall into the next column.
        best_col = -1
        best_overlap = 0
        for ci, col in enumerate(columns):
            col_left_rel = col.x - left_x
            col_right_rel = col_left_rel + col.width
            overlap = max(0, min(w_right, col_right_rel) - max(w_left, col_left_rel))
            if overlap > best_overlap:
                best_overlap = overlap
                best_col = ci

        if best_col >= 0 and best_overlap > 0:
            result[best_col].append(w)
        else:
            # Fallback: center-based range matching
            assigned = False
            for ci, (al, ar) in enumerate(col_ranges_rel):
                if al <= w_center_x < ar:
                    result[ci].append(w)
                    assigned = True
                    break

            if not assigned:
                # Last resort: nearest column center
                best_col = 0
                col_left_0 = columns[0].x - left_x
                best_dist = abs(w_center_x - (col_left_0 + columns[0].width / 2))
                for ci in range(1, n):
                    col_left = columns[ci].x - left_x
                    dist = abs(w_center_x - (col_left + columns[ci].width / 2))
                    if dist < best_dist:
                        best_dist = dist
                        best_col = ci
                result[best_col].append(w)

    return result

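# Illustrative overlap computation (standalone sketch with made-up
# coordinates, mirroring the primary pass above): a word spanning
# x = [100, 180] against columns at x = [0, 150] and x = [150, 400]:
#   overlap_col0 = max(0, min(180, 150) - max(100, 0))   = 50
#   overlap_col1 = max(0, min(180, 400) - max(100, 150)) = 30
# -> the word goes to column 0 (largest overlap wins).
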
# ---------------------------------------------------------------------------
# Cell text noise filtering
# ---------------------------------------------------------------------------

# Regex: at least 2 consecutive letters (Latin + umlauts + accents)
_RE_REAL_WORD = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]{2,}')
_RE_ALPHA = re.compile(r'[a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]')

# Common short EN/DE words (2-3 chars). Tokens at the end of a cell
# that do NOT appear here are treated as trailing OCR noise.
_COMMON_SHORT_WORDS: set = {
    # EN 1-2 letter
    'a', 'i', 'am', 'an', 'as', 'at', 'be', 'by', 'do', 'go', 'he',
    'if', 'in', 'is', 'it', 'me', 'my', 'no', 'of', 'oh', 'ok', 'on',
    'or', 'so', 'to', 'up', 'us', 'we',
    # EN 3 letter
    'ace', 'act', 'add', 'age', 'ago', 'aid', 'aim', 'air', 'all',
    'and', 'ant', 'any', 'ape', 'arc', 'are', 'ark', 'arm', 'art',
    'ask', 'ate', 'axe', 'bad', 'bag', 'ban', 'bar', 'bat', 'bay',
    'bed', 'bee', 'bet', 'big', 'bin', 'bit', 'bow', 'box', 'boy',
    'bud', 'bug', 'bun', 'bus', 'but', 'buy', 'cab', 'can', 'cap',
    'car', 'cat', 'cop', 'cow', 'cry', 'cub', 'cup', 'cut', 'dad',
    'dam', 'day', 'den', 'dew', 'did', 'die', 'dig', 'dim', 'dip',
    'dog', 'dot', 'dry', 'due', 'dug', 'dye', 'ear', 'eat', 'eel',
    'egg', 'elm', 'end', 'era', 'eve', 'ewe', 'eye', 'fan', 'far',
    'fat', 'fax', 'fed', 'fee', 'few', 'fig', 'fin', 'fir', 'fit',
    'fix', 'fly', 'foe', 'fog', 'for', 'fox', 'fry', 'fun', 'fur',
    'gag', 'gap', 'gas', 'get', 'god', 'got', 'gum', 'gun', 'gut',
    'guy', 'gym', 'had', 'ham', 'has', 'hat', 'hay', 'hen', 'her',
    'hid', 'him', 'hip', 'his', 'hit', 'hog', 'hop', 'hot', 'how',
    'hue', 'hug', 'hum', 'hut', 'ice', 'icy', 'ill', 'imp', 'ink',
    'inn', 'ion', 'its', 'ivy', 'jam', 'jar', 'jaw', 'jay', 'jet',
    'jig', 'job', 'jog', 'joy', 'jug', 'key', 'kid', 'kin', 'kit',
    'lab', 'lad', 'lag', 'lap', 'law', 'lay', 'led', 'leg', 'let',
    'lid', 'lie', 'lip', 'lit', 'log', 'lot', 'low', 'mad', 'man',
    'map', 'mat', 'maw', 'may', 'men', 'met', 'mid', 'mix', 'mob',
    'mog', 'mom', 'mop', 'mow', 'mrs', 'mud', 'mug', 'mum', 'nag',
    'nap', 'net', 'new', 'nod', 'nor', 'not', 'now', 'nun', 'nut',
    'oak', 'oar', 'oat', 'odd', 'off', 'oft', 'oil', 'old', 'one',
    'opt', 'orb', 'ore', 'our', 'out', 'owe', 'owl', 'own', 'pad',
    'pal', 'pan', 'pat', 'paw', 'pay', 'pea', 'peg', 'pen', 'per',
    'pet', 'pie', 'pig', 'pin', 'pit', 'ply', 'pod', 'pop', 'pot',
    'pro', 'pry', 'pub', 'pug', 'pun', 'pup', 'put', 'rag', 'ram',
    'ran', 'rap', 'rat', 'raw', 'ray', 'red', 'ref', 'rib', 'rid',
    'rig', 'rim', 'rip', 'rob', 'rod', 'roe', 'rot', 'row', 'rub',
    'rug', 'rum', 'run', 'rut', 'rye', 'sac', 'sad', 'sag', 'sap',
    'sat', 'saw', 'say', 'sea', 'set', 'sew', 'she', 'shy', 'sin',
    'sip', 'sir', 'sis', 'sit', 'six', 'ski', 'sky', 'sly', 'sob',
    'sod', 'son', 'sop', 'sot', 'sow', 'soy', 'spa', 'spy', 'sty',
    'sub', 'sue', 'sum', 'sun', 'sup', 'tab', 'tad', 'tag', 'tan',
    'tap', 'tar', 'tax', 'tea', 'ten', 'the', 'tie', 'tin', 'tip',
    'toe', 'ton', 'too', 'top', 'tow', 'toy', 'try', 'tub', 'tug',
    'two', 'urn', 'use', 'van', 'vat', 'vet', 'via', 'vie', 'vim',
    'vow', 'wag', 'war', 'was', 'wax', 'way', 'web', 'wed', 'wet',
    'who', 'why', 'wig', 'win', 'wit', 'woe', 'wok', 'won', 'woo',
    'wow', 'yam', 'yap', 'yaw', 'yea', 'yes', 'yet', 'yew', 'you',
    'zap', 'zip', 'zoo',
    # DE 2-3 letter
    'ab', 'da', 'du', 'ei', 'er', 'es', 'ja', 'ob', 'um', 'zu',
    'als', 'alt', 'auf', 'aus', 'bei', 'bin', 'bis', 'das', 'dem',
    'den', 'der', 'des', 'die', 'dir', 'ehe', 'ein', 'eng', 'gar',
    'gib', 'gut', 'hat', 'her', 'ich', 'ihm', 'ihr', 'ins', 'ist',
    'mal', 'man', 'mir', 'mit', 'nah', 'neu', 'nie', 'nur', 'nun',
    'ort', 'rad', 'rat', 'rot', 'ruf', 'ruh', 'sei', 'sie', 'tag',
    'tal', 'tat', 'tee', 'tor', 'tun', 'tut', 'uns', 'vom', 'von',
    'vor', 'war', 'was', 'weg', 'wem', 'wen', 'wer', 'wie', 'wir',
    'wut', 'zum', 'zur',
}

# Known abbreviations found in EN/DE textbooks and dictionaries.
# Stored WITHOUT trailing period (the noise filter strips periods).
# These rescue tokens like "sth." / "sb." / "usw." from being deleted.
_KNOWN_ABBREVIATIONS: set = {
    # EN dictionary meta-words
    'sth', 'sb', 'smth', 'smb', 'sbd',
    # EN general
    'etc', 'eg', 'ie', 'esp', 'approx', 'dept', 'govt', 'corp',
    'inc', 'ltd', 'vs', 'cf', 'ibid', 'nb', 'ps', 'asap',
    # EN references / textbook
    'p', 'pp', 'ch', 'chap', 'fig', 'figs', 'no', 'nos', 'nr',
    'vol', 'vols', 'ed', 'eds', 'rev', 'repr', 'trans', 'ff',
    'fn', 'sec', 'par', 'para', 'app', 'abbr', 'ex', 'exs',
    'ans', 'wb', 'tb', 'vocab',
    # EN parts of speech / grammar
    'adj', 'adv', 'prep', 'conj', 'pron', 'det', 'art', 'interj',
    'aux', 'mod', 'inf', 'pt', 'pres', 'pret', 'ger',
    'sg', 'pl', 'sing', 'irreg', 'reg', 'intr', 'intrans',
    'refl', 'pass', 'imper', 'subj', 'ind', 'perf', 'fut',
    'attr', 'pred', 'comp', 'superl', 'pos', 'neg',
    'lit', 'colloq', 'sl', 'dial', 'arch', 'obs', 'fml', 'infml',
    'syn', 'ant', 'opp', 'var', 'orig',
    # EN titles
    'mr', 'mrs', 'ms', 'dr', 'prof', 'st', 'jr', 'sr',
    # EN pronunciation
    'br', 'am', 'brit', 'amer',
    # EN units
    'hr', 'hrs', 'min', 'km', 'cm', 'mm', 'kg', 'mg', 'ml',
    # DE general
    'usw', 'bzw', 'evtl', 'ggf', 'ggfs', 'sog', 'eigtl', 'allg',
    'bes', 'insb', 'insbes', 'bspw', 'ca',
    'od', 'ua', 'sa', 'vgl', 'zb', 'dh', 'zt', 'idr',
    'inkl', 'exkl', 'zzgl', 'abzgl',
    # DE references
    'abs', 'abschn', 'abt', 'anm', 'ausg', 'aufl', 'bd', 'bde',
    'bearb', 'ebd', 'hrsg', 'hg', 'jg', 'jh', 'jhd', 'kap',
    's', 'sp', 'zit', 'zs', 'vlg',
    # DE grammar
    'nom', 'akk', 'dat', 'gen', 'konj', 'subst', 'obj',
    'praet', 'imp', 'part', 'mask', 'fem', 'neutr',
    'trennb', 'untrennb', 'ugs', 'geh', 'pej',
    # DE regional
    'nordd', 'österr', 'schweiz',
    # Linguistic
    'lex', 'morph', 'phon', 'phonet', 'sem', 'synt', 'etym',
    'deriv', 'pref', 'suf', 'suff', 'dim', 'coll',
    'count', 'uncount', 'indef', 'def', 'poss', 'demon',
}

def _is_noise_tail_token(token: str) -> bool:
    """Check if a token at the END of cell text is trailing OCR noise.

    Trailing fragments are very common OCR artifacts from image edges,
    borders, and neighbouring cells. This is more aggressive than a
    general word filter: any short token that isn't in the dictionary
    of common EN/DE words is considered noise.

    Examples of noise: "3", "ee", "B", "B|"
    Examples to keep: "sister.", "cupcakes.", "...", "mice", "[eg]"
    """
    t = token.strip()
    if not t:
        return True

    # Keep ellipsis
    if t in ('...', '…'):
        return False

    # Keep phonetic brackets: [eg], [maus], ["a:mand], serva], etc.
    if t.startswith('[') or t.startswith('["') or t.startswith("['"):
        return False
    if t.endswith(']'):
        return False

    # Keep meaningful punctuation tokens used in textbooks
    # = (definition marker), (= (definition opener), ; (separator)
    if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
        return False

    # Pure non-alpha -> noise ("3", ")", "|")
    alpha_chars = _RE_ALPHA.findall(t)
    if not alpha_chars:
        return True

    # Extract only alpha characters for dictionary lookup
    cleaned = ''.join(alpha_chars)

    # Known abbreviations (e.g. "sth.", "usw.", "adj.") — always keep
    if cleaned.lower() in _KNOWN_ABBREVIATIONS:
        return False

    # Strip normal trailing punctuation before checking for internal noise.
    stripped_punct = re.sub(r'[.,;:!?]+$', '', t)  # "cupcakes." -> "cupcakes"
    t_check = stripped_punct if stripped_punct else t

    # Check for legitimate punctuation patterns vs. real noise.
    # Legitimate: "(auf)", "under-", "e.g.", "(on)", "selbst)", "(wir",
    #             "(Salat-)Gurke", "Tanz(veranstaltung)", "(zer)brechen"
    # Noise: "3d", "B|", "x7"
    # Strategy: strip common dictionary punctuation (parens, hyphens, slashes),
    # THEN check if residual contains only alpha characters.
    t_inner = t_check
    # Remove all parentheses, hyphens, slashes, and dots — these are normal
    # in dictionary entries: "(Salat-)Gurke", "Tanz(veranstaltung)",
    # "(zer)brechen", "wir/uns", "e.g."
    t_inner = re.sub(r'[()\-/.,;:!?]', '', t_inner)
    # Now check: does the inner form still have non-alpha noise?
    inner_alpha = ''.join(_RE_ALPHA.findall(t_inner))
    has_internal_noise = (len(t_inner) > len(inner_alpha)) if t_inner else False

    # Long alpha words (4+ chars) without internal noise are likely real
    if len(cleaned) >= 4 and not has_internal_noise:
        return False

    # Short words: check dictionary (uses only alpha chars)
    if cleaned.lower() in _COMMON_SHORT_WORDS and not has_internal_noise:
        return False

    # Default: short or suspicious -> noise
    return True


def _is_garbage_text(text: str) -> bool:
    """Check if entire cell text is OCR garbage from image areas.

    Garbage text = no recognizable dictionary word. Catches
    "(ci]oeu", "uanoaain." etc.
    """
    words = _RE_REAL_WORD.findall(text)
    if not words:
        # Check if any token is a known abbreviation (e.g. "e.g.")
        alpha_only = ''.join(_RE_ALPHA.findall(text)).lower()
        if alpha_only in _KNOWN_ABBREVIATIONS:
            return False
        return True

    for w in words:
        wl = w.lower()
        # Known short word or abbreviation -> not garbage
        if wl in _COMMON_SHORT_WORDS or wl in _KNOWN_ABBREVIATIONS:
            return False
        # Long word (>= 4 chars): check vowel/consonant ratio.
        # Real EN/DE words have 20-60% vowels. Garbage like "uanoaain"
        # or "cioeu" has unusual ratios (too many or too few vowels).
        if len(wl) >= 4:
            vowels = sum(1 for c in wl if c in 'aeiouäöü')
            ratio = vowels / len(wl)
            if 0.15 <= ratio <= 0.65:
                return False  # plausible vowel ratio -> real word

    return True


def _clean_cell_text(text: str) -> str:
    """Remove OCR noise from cell text. Generic filters:

    1. If the entire text has no real alphabetic word (>= 2 letters), clear.
    2. If the entire text is garbage (no dictionary word), clear.
    3. Strip trailing noise tokens from the end of the text.
    """
    stripped = text.strip()
    if not stripped:
        return ''

    # --- Filter 1: No real word at all ---
    if not _RE_REAL_WORD.search(stripped):
        # Exception: dotted abbreviations like "e.g.", "z.B.", "i.e."
        alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
        if alpha_only not in _KNOWN_ABBREVIATIONS:
            return ''

    # --- Filter 2: Entire text is garbage ---
    if _is_garbage_text(stripped):
        return ''

    # --- Filter 3: Strip trailing noise tokens ---
    tokens = stripped.split()
    while tokens and _is_noise_tail_token(tokens[-1]):
        tokens.pop()
    if not tokens:
        return ''

    return ' '.join(tokens)

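# Illustrative behaviour (expected results traced by hand, not asserted
# anywhere in the commit):
#   _clean_cell_text("my sister. 3")    # -> "my sister."   (trailing noise)
#   _clean_cell_text("(ci]oeu")         # -> ""             (garbage text)
#   _clean_cell_text("look sth. up")    # -> "look sth. up" (abbreviation kept)
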
def _clean_cell_text_lite(text: str) -> str:
    """Simplified noise filter for cell-first OCR (isolated cell crops).

    Since each cell is OCR'd in isolation (no neighbour content visible),
    trailing-noise stripping is unnecessary. Only 2 filters remain:

    1. No real alphabetic word (>= 2 letters) and not a known abbreviation -> empty.
    2. Entire text is garbage (no dictionary word) -> empty.
    """
    stripped = text.strip()
    if not stripped:
        return ''

    # --- Filter 1: No real word at all ---
    if not _RE_REAL_WORD.search(stripped):
        alpha_only = ''.join(_RE_ALPHA.findall(stripped)).lower()
        if alpha_only not in _KNOWN_ABBREVIATIONS:
            return ''

    # --- Filter 2: Entire text is garbage ---
    if _is_garbage_text(stripped):
        return ''

    return stripped


# ---------------------------------------------------------------------------
# Bold detection via stroke-width analysis (relative / page-level)
# ---------------------------------------------------------------------------

def _measure_stroke_width(gray_crop: np.ndarray) -> float:
    """Measure mean stroke width in a binarised cell crop.

    Returns a DPI-normalised value (mean stroke width as % of crop height),
    or 0.0 if measurement is not possible.
    """
    if gray_crop is None or gray_crop.size == 0:
        return 0.0
    h, w = gray_crop.shape[:2]
    if h < 10 or w < 10:
        return 0.0

    # Binarise: text = white (255), background = black (0)
    _, bw = cv2.threshold(gray_crop, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    if cv2.countNonZero(bw) < 20:
        return 0.0

    # Distance transform: value at each white pixel = distance to nearest black
    dist = cv2.distanceTransform(bw, cv2.DIST_L2, 3)

    # Approximate skeleton via iterative erosion (a cheap stand-in for
    # true morphological thinning; sufficient for a mean-width estimate)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    thin = bw.copy()
    for _ in range(max(1, min(h, w) // 6)):
        eroded = cv2.erode(thin, kernel)
        if cv2.countNonZero(eroded) < 5:
            break
        thin = eroded

    skeleton_pts = thin > 0
    if not np.any(skeleton_pts):
        return 0.0
    mean_stroke = float(np.mean(dist[skeleton_pts]))
    return mean_stroke / max(h, 1) * 100  # normalised: % of cell height


def _classify_bold_cells(cells: List[Dict[str, Any]], ocr_img: Optional[np.ndarray],
                         img_w: int, img_h: int) -> None:
    """Two-pass bold detection: measure all cells, then compare against median.

    Cells with stroke width > 1.4x the page median are marked as bold.
    This adapts automatically to font, DPI and scan quality.
    Modifies cells in-place (sets 'is_bold' key).
    """
    if ocr_img is None:
        return

    # Pass 1: measure stroke width for every cell with text
    metrics: List[float] = []
    cell_strokes: List[float] = []
    for cell in cells:
        sw = 0.0
        if cell.get('text', '').strip():
            bp = cell['bbox_px']
            y1 = max(0, bp['y'])
            y2 = min(img_h, bp['y'] + bp['h'])
            x1 = max(0, bp['x'])
            x2 = min(img_w, bp['x'] + bp['w'])
            if y2 > y1 and x2 > x1:
                sw = _measure_stroke_width(ocr_img[y1:y2, x1:x2])
        cell_strokes.append(sw)
        if sw > 0:
            metrics.append(sw)

    if len(metrics) < 3:
        # Too few cells to compare — leave all as non-bold
        return

    median_sw = float(np.median(metrics))
    if median_sw <= 0:
        return

    # Pass 2: cells significantly above median -> bold
    for cell, sw in zip(cells, cell_strokes):
        cell['is_bold'] = sw > 0 and (sw / median_sw) > 1.4
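# Illustrative effect (made-up stroke widths, in % of cell height): with
# per-cell measurements [2.0, 2.1, 2.0, 3.2] the page median is 2.05, so
# only the 3.2 cell exceeds 1.4 * 2.05 = 2.87 and gets is_bold = True.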
189
klausur-service/backend/cv_ocr_cell_phonetics.py
Normal file
@@ -0,0 +1,189 @@
"""Cell-level IPA phonetic fixes for overlay mode.
|
||||
|
||||
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
Split from cv_ocr_engines.py — contains fix_cell_phonetics() and helpers.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from cv_vocab_types import IPA_AVAILABLE
|
||||
|
||||
from cv_ocr_ipa_lookup import (
|
||||
_insert_missing_ipa,
|
||||
_replace_phonetics_in_text,
|
||||
_text_has_garbled_ipa,
|
||||
)
|
||||
from cv_ocr_ipa_repair import (
|
||||
_has_non_dict_trailing,
|
||||
_insert_headword_ipa,
|
||||
_strip_post_bracket_garbled,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fix_cell_phonetics(
|
||||
cells: List[Dict[str, Any]],
|
||||
pronunciation: str = 'british',
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Apply IPA phonetic fixes to cell texts for overlay mode.
|
||||
|
||||
In the normal pipeline, _fix_phonetic_brackets operates on vocab entries
|
||||
(entry['english']). But the overlay reads cell['text'] directly, so
|
||||
phonetic fixes must be applied to cells too.
|
||||
|
||||
Processing depends on column type:
|
||||
- column_en: Full processing (replace garbled IPA + strip orphan brackets
|
||||
+ insert missing IPA). Safe because these cells contain only English
|
||||
headwords.
|
||||
- column_text: Light processing (replace garbled IPA ONLY). No orphan
|
||||
bracket stripping (brackets may be German content like "(probieren)")
|
||||
and no IPA insertion (would add tokens and break overlay positioning).
|
||||
"""
|
||||
if not IPA_AVAILABLE:
|
||||
return cells
|
||||
|
||||
ipa_col_types = {'column_en', 'column_text'}
|
||||
replaced = 0
|
||||
|
||||
for cell in cells:
|
||||
col_type = cell.get('col_type', '')
|
||||
if col_type not in ipa_col_types:
|
||||
continue
|
||||
text = cell.get('text', '') or ''
|
||||
if not text.strip():
|
||||
continue
|
||||
|
||||
if col_type == 'column_en':
|
||||
# Full processing: replace garbled IPA, strip orphan brackets.
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=True)
|
||||
if new_text == text:
|
||||
# Insert IPA when garbled phonetics exist OR when trailing
|
||||
# non-dictionary words suggest garbled IPA in plain ASCII.
|
||||
if _text_has_garbled_ipa(text) or _has_non_dict_trailing(text, pronunciation):
|
||||
new_text = _insert_missing_ipa(text, pronunciation)
|
||||
# Strip trailing garbled fragments after proper [IPA] brackets
|
||||
# (e.g. "sea [sˈiː] si:" → "sea [sˈiː]")
|
||||
if ']' in new_text:
|
||||
new_text = _strip_post_bracket_garbled(new_text, pronunciation)
|
||||
else:
|
||||
# column_text: replace garbled IPA, no orphan stripping
|
||||
new_text = _replace_phonetics_in_text(text, pronunciation, strip_orphans=False)
|
||||
# Insert headword IPA ONLY if there's a gap in word_boxes
|
||||
# suggesting Tesseract missed an IPA bracket on the page.
|
||||
# Without gap evidence, the original page had no IPA.
|
||||
if new_text == text:
|
||||
wb = cell.get('word_boxes', [])
|
||||
if _has_ipa_gap(text, wb):
|
||||
inserted = _insert_headword_ipa(text, pronunciation)
|
||||
if inserted != text:
|
||||
new_text = inserted
|
||||
_sync_word_boxes_after_ipa_insert(cell, text, new_text)
|
||||
|
||||
if new_text != text:
|
||||
logger.debug(f"fix_cell_phonetics: '{text}' → '{new_text}'")
|
||||
cell['text'] = new_text
|
||||
replaced += 1
|
||||
|
||||
if replaced:
|
||||
logger.info(f"fix_cell_phonetics: {replaced} IPA fixes in {len(cells)} cells")
|
||||
return cells
|
||||
|
||||
|
||||
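# Minimal usage sketch (the cell dict is an assumption, not from the
# commit; the exact output depends on the IPA dictionaries):
#
#   cells = [{'col_type': 'column_en', 'text': "dance {'tfatno]",
#             'word_boxes': []}]
#   fix_cell_phonetics(cells, pronunciation='british')
#   # cells[0]['text'] would become e.g. "dance [dˈɑːns]" if 'dance'
#   # resolves in the Britfone dictionary.
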
def _has_ipa_gap(text: str, word_boxes: List[Dict]) -> bool:
    """Check if word_boxes show a gap where IPA brackets should be.

    On a typical vocab page, the layout is:
        headword [ipa] German translation

    If Tesseract missed the IPA bracket, the gap between the headword
    and the next word (German translation) is unusually large (>80px)
    because the IPA occupied physical space on the page.

    If no IPA was on the page (e.g. "be good at sth."), the words are
    close together (<30px).
    """
    if not word_boxes or len(word_boxes) < 2:
        return False

    tokens = text.split()
    if not tokens:
        return False

    # Find the headword index: skip numeric prefixes like "».55", "0.56"
    hw_box_idx = 0
    for i, wb in enumerate(word_boxes):
        wt = wb.get('text', '')
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß]', '', wt)
        if len(clean) >= 2:
            hw_box_idx = i
            break

    if hw_box_idx >= len(word_boxes) - 1:
        return False

    # Check gap between headword and the next word_box
    hw = word_boxes[hw_box_idx]
    next_wb = word_boxes[hw_box_idx + 1]
    gap = next_wb['left'] - (hw['left'] + hw['width'])

    return gap > 80

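# Worked example (illustrative boxes): a headword box at left=100 with
# width=120 followed by a box at left=320 leaves a gap of
# 320 - (100 + 120) = 100 px > 80, so an IPA bracket was probably lost.
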
def _sync_word_boxes_after_ipa_insert(
|
||||
cell: Dict[str, Any],
|
||||
old_text: str,
|
||||
new_text: str,
|
||||
) -> None:
|
||||
"""Insert a synthetic word_box for an IPA token added by IPA insertion.
|
||||
|
||||
E.g. "challenge ..." → "challenge [tʃælɪndʒ] ..."
|
||||
Adds a new word_box right after the headword's box so the 1:1
|
||||
token-to-box mapping in the frontend overlay stays consistent.
|
||||
"""
|
||||
word_boxes = cell.get('word_boxes')
|
||||
if not word_boxes:
|
||||
return
|
||||
|
||||
old_tokens = old_text.split()
|
||||
new_tokens = new_text.split()
|
||||
|
||||
if len(new_tokens) != len(old_tokens) + 1:
|
||||
return # unexpected change, skip
|
||||
|
||||
# Find the inserted token by walking both lists in parallel.
|
||||
# One token in new_tokens won't match — that's the inserted IPA.
|
||||
insert_idx = -1
|
||||
j = 0 # index into old_tokens
|
||||
for i in range(len(new_tokens)):
|
||||
if j < len(old_tokens) and new_tokens[i] == old_tokens[j]:
|
||||
j += 1
|
||||
else:
|
||||
insert_idx = i
|
||||
break
|
||||
|
||||
if insert_idx < 0 or insert_idx >= len(new_tokens):
|
||||
return
|
||||
|
||||
ipa_token = new_tokens[insert_idx]
|
||||
|
||||
# The headword is at insert_idx - 1 in old_tokens (and word_boxes)
|
||||
ref_idx = insert_idx - 1
|
||||
if ref_idx < 0 or ref_idx >= len(word_boxes):
|
||||
return
|
||||
|
||||
ref_box = word_boxes[ref_idx]
|
||||
ipa_box = {
|
||||
'text': ipa_token,
|
||||
'left': ref_box['left'] + ref_box['width'] + 2,
|
||||
'top': ref_box['top'],
|
||||
'width': ref_box['width'],
|
||||
'height': ref_box['height'],
|
||||
'conf': ref_box.get('conf', 90),
|
||||
}
|
||||
word_boxes.insert(insert_idx, ipa_box)
|
||||
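For illustration, here is roughly how the >80px gap heuristic behaves; the boxes below are hand-made and every pixel value is hypothetical:

boxes_gap = [
    {'text': 'challenge', 'left': 100, 'width': 120},
    {'text': 'Herausforderung', 'left': 340, 'width': 180},
]
boxes_tight = [
    {'text': 'be', 'left': 100, 'width': 25},
    {'text': 'good', 'left': 130, 'width': 45},
]
# _has_ipa_gap('challenge Herausforderung', boxes_gap)  → True  (gap = 120px > 80)
# _has_ipa_gap('be good at sth.', boxes_tight)          → False (gap = 5px)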
File diff suppressed because it is too large
476
klausur-service/backend/cv_ocr_ipa_lookup.py
Normal file
@@ -0,0 +1,476 @@
"""
IPA lookup and phonetic bracket handling for OCR-extracted vocabulary.

Tesseract and other OCR engines frequently garble IPA phonetic transcriptions
in vocabulary tables (e.g. [ˈdɑːns] → {'tfatno] or (cy)). This module
provides functions to:

- Look up correct IPA pronunciations (British/American) for English words.
- Detect and replace garbled phonetic brackets with dictionary IPA.
- Insert missing IPA for headwords where OCR destroyed the brackets entirely.
- Strip orphan brackets and post-bracket garbled fragments.
- Handle IPA continuation cells (phonetics on a separate row from headword).

All IPA data comes from open-source dictionaries:
- Britfone (MIT) for British English
- eng_to_ipa / CMU (MIT) for American English

Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""

import logging
import re
from typing import Any, Dict, List, Optional

from cv_vocab_types import (
    IPA_AVAILABLE,
    _britfone_dict,
    _ipa_convert_american,
)

logger = logging.getLogger(__name__)


# --- D. Phonetic Bracket IPA Replacement ---

# Pattern: word followed by any bracket type containing phonetic content.
# Tesseract often garbles IPA brackets: [ˈdɑːns] → {'tfatno] or (cy) etc.
# Match any opener ([, {, () with any closer (], }, )) — even mixed pairs.
# This intentionally matches mixed brackets (e.g. {content]) because
# Tesseract frequently misrecognizes bracket characters.
_PHONETIC_BRACKET_RE = re.compile(
    r'(\b[a-zA-ZäöüÄÖÜß]+)\s*[\[\{\(]([^\]\}\)]*?)[\]\}\)]'
)

# Unicode IPA characters — used to distinguish correct IPA (from dictionary
# lookup) from garbled OCR content when stripping orphan brackets.
_IPA_CHARS = frozenset('ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾʔɐ')

# Minimum word confidence for full-page Tesseract results (0-100).
# Words below this threshold are OCR noise (scanner shadows, borders).
_MIN_WORD_CONF = 30


def _lookup_ipa(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Look up IPA for a word using the selected pronunciation dictionary.

    Args:
        word: English word to look up.
        pronunciation: 'british' (Britfone, MIT) or 'american' (eng_to_ipa, MIT).

    Returns:
        IPA string or None if not found.
    """
    word_lower = word.lower().strip()
    if not word_lower:
        return None

    if pronunciation == 'british' and _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
        # Fallback to American if not in Britfone
        if _ipa_convert_american:
            result = _ipa_convert_american(word_lower)
            if result and '*' not in result:
                return result
        return None

    if pronunciation == 'american' and _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result
        # Fallback to Britfone if not in CMU
        if _britfone_dict:
            ipa = _britfone_dict.get(word_lower)
            if ipa:
                return ipa
        return None

    # Try any available source
    if _britfone_dict:
        ipa = _britfone_dict.get(word_lower)
        if ipa:
            return ipa
    if _ipa_convert_american:
        result = _ipa_convert_american(word_lower)
        if result and '*' not in result:
            return result

    return None
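Assuming the Britfone and eng_to_ipa dictionaries loaded successfully, the fallback chain behaves roughly like this; the exact transcriptions depend on the shipped data:

_lookup_ipa('dance', 'british')    # e.g. 'dˈɑːns' from Britfone
_lookup_ipa('dance', 'american')   # e.g. 'dæns' via eng_to_ipa/CMU
_lookup_ipa('zzqx', 'british')     # None — no hit, and '*' results are rejected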
def _fix_phonetic_brackets(
    entries: List[Dict[str, Any]],
    pronunciation: str = 'british',
) -> List[Dict[str, Any]]:
    """Replace OCR'd phonetic transcriptions with dictionary IPA.

    Detects patterns like "dance [du:ns]" and replaces with correct IPA:
    - British: "dance [dˈɑːns]" (Britfone, MIT)
    - American: "dance [dæns]" (eng_to_ipa/CMU, MIT)

    Only replaces if the word before brackets is found in the dictionary.
    """
    if not IPA_AVAILABLE:
        return entries

    # IPA phonetics only appear in the ENGLISH field of vocab tables.
    # German and example fields contain meaningful parenthetical content:
    #   german: "Eis (gefrorenes Wasser)", "(Salat-)Gurke", "sauer (auf)"
    #   example: "(sich beschweren)", "(brauchen)", "(jammern)"
    # These must NEVER be processed as phonetic transcriptions.
    replaced_count = 0
    for entry in entries:
        text = entry.get('english', '') or ''
        if not any(ch in text for ch in '[{('):
            continue
        new_text = _replace_phonetics_in_text(text, pronunciation)
        if new_text != text:
            logger.debug(f"_fix_phonetic_brackets: '{text}' → '{new_text}'")
            replaced_count += 1
            entry['english'] = new_text

    if replaced_count:
        logger.info(f"_fix_phonetic_brackets: {replaced_count} IPA replacements in {len(entries)} entries")
    return entries


# Grammar particles that appear in brackets after English words:
#   cross (with), complain (about/of), agree (on/with), look (sth) up
# These must NOT be replaced with IPA. Only used for the English field
# (German/example fields are never processed for IPA replacement).
_GRAMMAR_BRACKET_WORDS = frozenset({
    # English prepositions/particles commonly in vocab tables
    'with', 'about', 'of', 'for', 'to', 'from', 'in', 'on', 'at', 'by',
    'up', 'out', 'off', 'into', 'over', 'down', 'away', 'back', 'through',
    # English grammar abbreviations used in vocab tables
    'sth', 'sb', 'adj', 'adv',
    # Number/plural/grammar annotations
    'pl', 'sg', 'sing', 'no', 'also', 'auch',
    # Regional English markers
    'ae', 'be', 'ame', 'bre',
})


def _is_grammar_bracket_content(content: str) -> bool:
    """Return True if bracket content is grammar info in the ENGLISH field.

    Grammar info: cross (with), complain (about/of), agree (on/with)
    NOT grammar: [breik], [maus], {'tfatno], (cy), ['kju:kambo], [test]

    Since we only process the English field, we only need to recognize
    English grammar particles. Everything else is (garbled) IPA.
    """
    if not content:
        return False

    # Split on / and spaces for patterns like (about/of), (no pl)
    tokens = re.split(r'[/\s]+', content.strip().lower())
    tokens = [t for t in tokens if t]
    if not tokens:
        return False

    # ALL tokens must be known grammar words
    return all(token in _GRAMMAR_BRACKET_WORDS for token in tokens)
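A few concrete cases, as a rough sketch of the intended classification:

_is_grammar_bracket_content('about/of')    # True  — all tokens are grammar particles
_is_grammar_bracket_content('no pl')       # True
_is_grammar_bracket_content("'tfatno")     # False — garbled IPA
_is_grammar_bracket_content('breik')       # False — not a known particle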
def _replace_phonetics_in_text(
    text: str,
    pronunciation: str = 'british',
    strip_orphans: bool = True,
) -> str:
    """Replace [phonetic] / {phonetic} / (phonetic) after words with dictionary IPA.

    Tesseract garbles IPA brackets, e.g. China [ˈtʃaɪnə] → China {'tfatno].
    We match any bracket type and replace with dictionary IPA if found.
    Legitimate parenthetical content like (zer)brechen or (Veranstaltung) is preserved.

    Args:
        strip_orphans: If True, strip orphan brackets that look like garbled IPA.
            Set to False for column_text where brackets may be German content.
    """
    if not IPA_AVAILABLE:
        return text

    def replacer(match):
        word = match.group(1)
        bracket_content = match.group(2).strip()
        full_match = match.group(0)

        # Skip if bracket content looks like regular text (multiple words)
        if len(bracket_content.split()) > 3:
            return full_match

        # Look up IPA for the word before brackets
        ipa = _lookup_ipa(word, pronunciation)

        if ipa:
            # Word has IPA → bracket content is phonetic (garbled or correct).
            # Exception: grammar particles like cross (with) — keep those.
            if _is_grammar_bracket_content(bracket_content):
                return full_match
            logger.debug(f"phonetic: '{full_match}' → '{word} [{ipa}]'")
            return f"{word} [{ipa}]"

        # No IPA for this word — keep as-is
        return full_match

    text = _PHONETIC_BRACKET_RE.sub(replacer, text)

    if strip_orphans:
        # Second pass: handle remaining orphan brackets with no word before
        # them (the main regex requires \b word \s* bracket).
        # Garbled examples seen in OCR output: "[mais]", "{'mani setva]", "(kros]"
        # Keep: grammar parens "(sich beschweren)", correct IPA "[dˈɑːns]"
        def _strip_orphan_bracket(m):
            content = m.group(1).strip()
            # Keep grammar info: (sich beschweren), (about/of)
            if _is_grammar_bracket_content(content):
                return m.group(0)
            # Keep correct IPA (contains Unicode IPA characters)
            if any(ch in _IPA_CHARS for ch in content):
                return m.group(0)
            # Keep real-word parentheticals like (probieren), (Profit), (Geld).
            # Only very short fragments (≤3 letters, e.g. (cy)) are stripped as
            # garbled IPA; anything with ≥4 letters is kept to be safe.
            content_alpha = re.sub(r'[^a-zA-ZäöüÄÖÜßéèêëàâîïôûùç]', '', content)
            if len(content_alpha) >= 4:
                return m.group(0)
            logger.debug(f"phonetic: stripping orphan bracket '{m.group(0)}'")
            return ''

        text = re.sub(r'[\[\{\(]([^\]\}\)]*)[\]\}\)]', _strip_orphan_bracket, text)

    text = text.strip()

    return text
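Putting the two passes together, the behaviour is roughly as follows (dictionary IPA hedged as above):

_replace_phonetics_in_text("China {'tfatno]", 'british')
# → e.g. "China [tʃˈaɪnə]"  — mixed brackets matched, dictionary IPA substituted
_replace_phonetics_in_text('cross (with)', 'british')
# → 'cross (with)'          — grammar particle preserved
_replace_phonetics_in_text('(cy)', 'british')
# → ''                      — short orphan bracket stripped as garbled IPA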
def _text_has_garbled_ipa(text: str) -> bool:
    """Check if text contains garbled IPA-like fragments from OCR.

    Returns True if there is evidence of OCR-mangled phonetic
    transcription, e.g. stress marks, length marks, or IPA special chars.
    This is used to decide whether ``_insert_missing_ipa`` should run:
    it must only insert IPA to *replace* garbled phonetics that are already
    in the text — never to ADD phonetics where none existed on the page.
    """
    # Bracketed text that doesn't contain valid IPA symbols is garbled OCR
    # of a phonetic transcription, e.g. "[n, nn]" or "[1uedtX,1]".
    stripped = text.strip()
    if stripped.startswith('[') and stripped.endswith(']'):
        inner = stripped[1:-1]
        # Real IPA brackets contain IPA symbols (ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ)
        if not any(c in inner for c in 'ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ'):
            # Not a valid dictionary-style bracket like "(no pl)" — those
            # use parentheses, not square brackets. Square brackets with
            # no IPA chars are garbled phonetics.
            return True

    for w in text.strip().split():
        # Skip delimiters and very short tokens
        if len(w) <= 1 or w in ('–', '—', '-', '/', '|', ',', ';'):
            continue
        # Starts with stress mark (OCR read IPA stress ˈ as apostrophe)
        if w.startswith("'") and len(w) > 1 and not w[1:].istitle():
            return True
        if w.startswith("\u02c8") or w.startswith("\u02cc"):  # ˈ ˌ
            return True
        # Contains IPA length mark ':' in a short non-word fragment
        if ':' in w and len(w) < 12:
            # But not things like "3:00" (time) or common words
            stripped = re.sub(r'[^a-zA-Z:]', '', w)
            if ':' in stripped and not stripped.replace(':', '').isalpha():
                continue
            return True
        # Contains IPA special characters
        if any(c in w for c in 'əɪɛɒʊʌæɑɔʃʒθðŋ'):
            return True
        # Embedded apostrophe suggesting merged garbled IPA with stress mark.
        # E.g. "Scotland'skotland" — OCR reads ˈ as '.
        # Guard: apostrophe must be after ≥3 chars and before ≥3 lowercase
        # chars to avoid contractions (don't, won't, o'clock).
        if "'" in w and not w.startswith("'"):
            apos_idx = w.index("'")
            after = w[apos_idx + 1:]
            if apos_idx >= 3 and len(after) >= 3 and after[0].islower():
                return True
    return False
def _decompose_compound(word: str, pronunciation: str = 'british') -> Optional[str]:
    """Try to decompose a compound word and concatenate IPA for each part.

    E.g. "schoolbag" → "school"+"bag" → IPA for both concatenated.
    Only returns IPA if ALL parts are found in the dictionary.

    Tries splits at every position (min 3 chars per part) and picks the
    split where the first part is longest.
    """
    if not IPA_AVAILABLE:
        return None
    lower = word.lower().strip()
    if len(lower) < 6:
        return None  # too short for a compound

    best_ipa = None
    best_first_len = 0

    for split_pos in range(3, len(lower) - 2):  # min 3 chars each part
        first = lower[:split_pos]
        second = lower[split_pos:]
        ipa_first = _lookup_ipa(first, pronunciation)
        ipa_second = _lookup_ipa(second, pronunciation)
        if ipa_first and ipa_second:
            if split_pos > best_first_len:
                best_first_len = split_pos
                best_ipa = ipa_first + ipa_second

    return best_ipa
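For example, assuming both halves are in the loaded dictionary:

_decompose_compound('schoolbag')   # e.g. IPA('school') + IPA('bag') concatenated
_decompose_compound('badge')       # None — shorter than 6 chars, not a compound
_decompose_compound('zzqbag')      # None — 'zzq' is in no dictionary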
def _insert_missing_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA pronunciation for English words that have no brackets at all.

    OCR sometimes garbles the phonetic transcription into plain-text fragments
    (e.g. "scare skea" where "skea" is garbled /skɛə/). This scans the text
    for the headword, inserts correct [IPA], and strips the garbled fragments.

    Only inserts for words that:
    - are standalone (not already followed by a bracket)
    - have an IPA entry in the dictionary
    - appear to be English headwords (at the start of text or after common
      separators like ",", ";", "•")

    This is intentionally conservative: it only inserts at the END of each
    whitespace-separated token group to avoid breaking phrases.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    # Skip if already has brackets (IPA replacement handles those)
    if any(ch in text for ch in '[{('):
        return text

    # Only process short text fragments (typical vocab cells).
    # Long sentences / paragraphs should not get IPA insertions.
    words = text.strip().split()
    if len(words) > 6:
        return text

    # Try to insert IPA for the first alphanumeric word
    # Typical patterns: "challenge", "profit", "film", "badge"
    for i, w in enumerate(words):
        # Clean punctuation for lookup
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        # Skip German/grammar words
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        # Fallback: try without hyphens (e.g. "second-hand" → "secondhand")
        if not ipa and '-' in clean:
            ipa = _lookup_ipa(clean.replace('-', ''), pronunciation)
        # Fallback 0b: compound word decomposition
        # E.g. "schoolbag" → "school"+"bag" → concatenated IPA
        if not ipa:
            ipa = _decompose_compound(clean, pronunciation)
        # Fallback 1: IPA-marker split for merged tokens where OCR
        # joined headword with its IPA (e.g. "schoolbagsku:lbæg").
        # Find the first IPA marker character (:, æ, ɪ, etc.), walk
        # backwards ≤3 chars for the onset consonant cluster, and
        # split into headword + OCR IPA.
        _IPA_SPLIT_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
        if not ipa:
            first_marker = next(
                (p for p, ch in enumerate(w) if ch in _IPA_SPLIT_CHARS), -1,
            )
            if first_marker >= 3:
                split = first_marker
                while (split > 0
                       and split > first_marker - 3
                       and w[split - 1].isalpha()
                       and w[split - 1].islower()):
                    split -= 1
                if split >= 2:
                    headword = w[:split]
                    ocr_ipa = w[split:]
                    hw_ipa = _lookup_ipa(headword, pronunciation)
                    if not hw_ipa:
                        # Try compound decomposition for the headword part
                        hw_ipa = _decompose_compound(headword, pronunciation)
                    if hw_ipa:
                        words[i] = f"{headword} [{hw_ipa}]"
                    else:
                        # Word not in dictionary — use OCR IPA
                        words[i] = f"{headword} [{ocr_ipa}]"
                    words = words[:i + 1]
                    ipa = True  # signal that we handled it
                    break
        # Fallback 2: prefix matching for merged tokens WITHOUT IPA
        # markers (e.g. "Scotland'skotland"). Find longest dictionary
        # prefix using only alpha chars to avoid punctuation matches.
        if not ipa:
            alpha = re.sub(r'[^a-zA-Z]', '', clean)
            if len(alpha) > 5:  # need at least 6 chars for meaningful split
                for end in range(len(alpha), 3, -1):  # min prefix 4 chars
                    prefix = alpha[:end]
                    test_ipa = _lookup_ipa(prefix, pronunciation)
                    if test_ipa:
                        ipa = test_ipa
                        w = prefix
                        words[i] = prefix
                        break
        if ipa:
            words[i] = f"{w} [{ipa}]"
            # Strip garbled OCR phonetics after the IPA bracket.
            # On scanned vocab pages, printed IPA is read as garbled
            # text (e.g. "scare skea" where "skea" is garbled /skɛə/).
            # After inserting correct IPA, remove remaining words that
            # aren't real English words, delimiters, or German text.
            kept = words[:i + 1]
            for j in range(i + 1, len(words)):
                wj = words[j]
                # Delimiter — keep this and everything after
                if wj in ('–', '—', '-', '/', '|', ',', ';'):
                    kept.extend(words[j:])
                    break
                # Pure digits or numbering (e.g. "1", "2.", "3)") — keep
                if re.match(r'^[\d.)\-]+$', wj):
                    kept.extend(words[j:])
                    break
                # Starts with uppercase — likely German or proper noun
                clean_j = re.sub(r'[^a-zA-Z]', '', wj)
                if clean_j and clean_j[0].isupper():
                    kept.extend(words[j:])
                    break
                # Known English word (≥2 chars) — keep it and rest
                if clean_j and len(clean_j) >= 2:
                    if _lookup_ipa(clean_j, pronunciation):
                        kept.extend(words[j:])
                        break
                # Merged token: dictionary word + garbled IPA stuck together.
                # E.g. "fictionsalans'fIkfn" starts with "fiction".
                # Extract the dictionary prefix (≥4 chars) and add it with
                # IPA, but only if enough chars remain after the prefix (≥3)
                # to look like garbled IPA, not just a plural 's'.
                if clean_j and len(clean_j) >= 7:
                    for pend in range(min(len(clean_j) - 3, 15), 3, -1):
                        prefix_j = clean_j[:pend]
                        prefix_ipa = _lookup_ipa(prefix_j, pronunciation)
                        if prefix_ipa:
                            kept.append(f"{prefix_j} [{prefix_ipa}]")
                            break
                    break  # rest of this token is garbled
                # Otherwise — likely garbled phonetics, skip
            words = kept
            break

    return ' '.join(words)
287
klausur-service/backend/cv_ocr_ipa_repair.py
Normal file
@@ -0,0 +1,287 @@
"""
Advanced IPA repair for OCR-extracted vocabulary.

Functions that detect and fix garbled IPA fragments trailing after
headwords or in continuation cells. Split from cv_ocr_ipa_lookup.py
to stay within the 500 LOC budget.

Contains:
- _has_non_dict_trailing: detect non-dictionary trailing words
- _strip_post_bracket_garbled: strip garbled IPA after [brackets]
- fix_ipa_continuation_cell: replace garbled IPA in continuation rows
- _insert_headword_ipa: insert IPA for first headword in mixed-lang lines
"""

import logging
import re
from typing import Any, Dict, List, Optional

from cv_vocab_types import IPA_AVAILABLE
from cv_ocr_ipa_lookup import (
    _lookup_ipa,
    _GRAMMAR_BRACKET_WORDS,
)

logger = logging.getLogger(__name__)
def _has_non_dict_trailing(text: str, pronunciation: str = 'british') -> bool:
    """Check if text has a headword followed by non-dictionary trailing words.

    Used as an additional trigger for ``_insert_missing_ipa`` when
    ``_text_has_garbled_ipa`` returns False because the garbled IPA
    happens to look like plain ASCII (e.g. "skea" for /skɛə/).
    """
    if not IPA_AVAILABLE:
        return False
    words = text.strip().split()
    if len(words) < 2 or len(words) > 6:
        return False
    # Find first dictionary word
    hw_idx = -1
    for i, w in enumerate(words):
        clean = re.sub(r'[^a-zA-Z\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        if _lookup_ipa(clean, pronunciation):
            hw_idx = i
            break
    if hw_idx < 0 or hw_idx >= len(words) - 1:
        return False
    # Check ALL remaining words — if none are dictionary/delimiter/German,
    # they are likely garbled IPA.
    for j in range(hw_idx + 1, len(words)):
        wj = words[j]
        if wj in ('–', '—', '-', '/', '|', ',', ';'):
            return False
        # Pure digits or numbering (e.g. "1", "2.", "3)") — not garbled IPA
        if re.match(r'^[\d.)\-]+$', wj):
            return False
        clean_j = re.sub(r'[^a-zA-Z]', '', wj)
        if clean_j and clean_j[0].isupper():
            return False
        if clean_j and len(clean_j) >= 2 and _lookup_ipa(clean_j, pronunciation):
            return False
    return True
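A sketch of the trigger on typical cells, assuming dictionary hits for the real words:

_has_non_dict_trailing('scare skea')   # True  — 'skea' (garbled /skɛə/) is in no dictionary
_has_non_dict_trailing('scare away')   # False — 'away' is a dictionary word
_has_non_dict_trailing('scare 2.')     # False — numbering is not garbled IPA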
def _strip_post_bracket_garbled(
    text: str, pronunciation: str = 'british',
) -> str:
    """Strip garbled IPA fragments that trail after proper [IPA] brackets.

    E.g. ``sea [sˈiː] si:`` → ``sea [sˈiː]``
         ``seat [sˈiːt] si:t`` → ``seat [sˈiːt]``
         ``seat [sˈiːt] belt si:t belt`` → ``seat [sˈiːt] belt``

    For multi-word headwords like "seat belt", a real English word ("belt")
    may be followed by garbled IPA duplicates. We detect this by checking
    whether the sequence after a real word contains IPA markers (`:`, `ə`,
    etc.) — if so, everything from the first garbled token onward is stripped.
    """
    if ']' not in text:
        return text
    last_bracket = text.rfind(']')
    if last_bracket >= len(text) - 1:
        return text
    before = text[:last_bracket + 1].rstrip()
    after = text[last_bracket + 1:].strip()
    if not after:
        return text

    _IPA_MARKER_CHARS = set(':əɪɛɒʊʌæɑɔʃʒθðŋˈˌ')
    after_words = after.split()
    kept: List[str] = []
    for idx, w in enumerate(after_words):
        # Delimiter — keep rest
        if w in ('–', '—', '-', '/', '|', ',', ';'):
            kept.extend(after_words[idx:])
            break
        # Contains IPA markers (length mark, IPA chars) — garbled, skip
        if any(c in w for c in _IPA_MARKER_CHARS):
            # Everything from here is garbled IPA — stop scanning
            # but look ahead: if any remaining words are real English
            # words WITHOUT IPA markers, they might be a different headword
            # following. Only skip the contiguous garbled run.
            continue
        clean = re.sub(r'[^a-zA-Z]', '', w)
        # Uppercase — likely German, keep rest
        if clean and clean[0].isupper():
            kept.extend(after_words[idx:])
            break
        # Known English word — keep it, but check if followed by garbled IPA
        # (multi-word headword case like "seat [siːt] belt si:t belt")
        if clean and len(clean) >= 2 and _lookup_ipa(clean, pronunciation):
            # Peek ahead: if next word has IPA markers, the rest is garbled
            remaining = after_words[idx + 1:]
            has_garbled_after = any(
                any(c in rw for c in _IPA_MARKER_CHARS)
                for rw in remaining
            )
            if has_garbled_after:
                # Keep this real word but stop — rest is garbled duplication
                kept.append(w)
                # Still scan for delimiters/German in the remaining words
                for ridx, rw in enumerate(remaining):
                    if rw in ('–', '—', '-', '/', '|', ',', ';'):
                        kept.extend(remaining[ridx:])
                        break
                    rclean = re.sub(r'[^a-zA-Z]', '', rw)
                    if rclean and rclean[0].isupper():
                        kept.extend(remaining[ridx:])
                        break
                break
            else:
                kept.extend(after_words[idx:])
                break
        # Unknown short word — likely garbled, skip
    if kept:
        return before + ' ' + ' '.join(kept)
    return before
def fix_ipa_continuation_cell(
    garbled_text: str,
    headword_text: str,
    pronunciation: str = 'british',
) -> str:
    """Replace garbled IPA in a continuation row with proper IPA.

    Continuation rows appear below the headword and contain only the
    printed phonetic transcription, which OCR garbles into fragments
    like ``ska:f – ska:vz`` (should be ``[skˈɑːf] – [skˈɑːvz]``).

    Args:
        garbled_text: The OCR-garbled IPA text from the continuation row.
        headword_text: The headword text from the previous row
            (e.g. ``scarf – scarves``).
        pronunciation: ``'british'`` or ``'american'``.

    Returns:
        Corrected IPA text, or the original if no fix could be applied.
    """
    if not IPA_AVAILABLE or not garbled_text or not headword_text:
        return garbled_text

    # If headword already has inline IPA like "beat [bˈiːt] , beat, beaten",
    # only generate continuation IPA for words NOT already covered.
    covered_words: set = set()
    has_inline_ipa = bool(re.search(r'\[[^\]]*\]', headword_text))
    if has_inline_ipa:
        # Words before the first bracket already have their IPA shown
        first_bracket = headword_text.index('[')
        pre_bracket = headword_text[:first_bracket].strip()
        for w in pre_bracket.split():
            clean = re.sub(r'[^a-zA-Z\'-]', '', w).lower()
            if clean and len(clean) >= 2:
                covered_words.add(clean)

        last_bracket_end = headword_text.rfind(']')
        tail = headword_text[last_bracket_end + 1:].strip()

        if not tail or not re.search(r'[a-zA-Z]{2,}', tail):
            # Bracket is at the end (e.g. "the Highlands [ˈhaɪləndz]")
            # — return the inline IPA directly (continuation duplicates it)
            last_bracket_start = headword_text.rfind('[')
            inline_ipa = headword_text[last_bracket_start:last_bracket_end + 1]
            return inline_ipa

        # Only the tail words need continuation IPA
        headword_text = tail

    # Strip existing IPA brackets and parenthetical grammar annotations
    # like "(no pl)", "(sth)", "(sb)" from headword text
    clean_hw = re.sub(r'\[[^\]]*\]', '', headword_text)
    clean_hw = re.sub(r'\([^)]*\)', '', clean_hw).strip()
    if not clean_hw:
        return garbled_text

    # Split headword by delimiters (– — -)
    # "scarf – scarves" → ["scarf", "scarves"]
    # "see - saw - seen" → ["see", "saw", "seen"]
    parts = re.split(r'\s*[–—]\s*|\s+-\s+', clean_hw)
    parts = [p.strip() for p in parts if p.strip()]

    if not parts:
        return garbled_text

    # Look up IPA for each headword part.
    # Skip articles (the, a, an) — they never get IPA in vocab books.
    # Other function words like "down", "up" are kept because they are
    # integral parts of phrasal verbs (e.g. "close down").
    # Skip words that already have inline IPA in the headword row.
    _ARTICLES = {'the', 'a', 'an'}
    ipa_parts: List[str] = []
    for part in parts:
        # A part may be multi-word like "secondary school"
        words = part.split()
        word_ipas: List[str] = []
        for w in words:
            clean_w = re.sub(r'[^a-zA-Z\'-]', '', w)
            if not clean_w or len(clean_w) < 2:
                continue
            if covered_words and clean_w.lower() in covered_words:
                continue  # Already has IPA inline in the headword
            if clean_w.lower() in _ARTICLES:
                continue  # Articles never get IPA in vocab books
            ipa = _lookup_ipa(clean_w, pronunciation)
            if ipa:
                word_ipas.append(ipa)
        if word_ipas:
            ipa_parts.append('[' + ' '.join(word_ipas) + ']')

    if not ipa_parts:
        return garbled_text

    # Join with delimiter
    result = ' – '.join(ipa_parts)
    logger.debug(
        "fix_ipa_continuation: '%s' → '%s' (headwords: '%s')",
        garbled_text, result, headword_text,
    )
    return result
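Assuming dictionary entries for scarf/scarves exist, a continuation row is repaired roughly like this:

fix_ipa_continuation_cell('ska:f – ska:vz', 'scarf – scarves')
# → e.g. '[skˈɑːf] – [skˈɑːvz]'
fix_ipa_continuation_cell('haI landz', 'the Highlands [ˈhaɪləndz]')
# → '[ˈhaɪləndz]' — the inline IPA at the end of the headword row is reused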
def _insert_headword_ipa(text: str, pronunciation: str = 'british') -> str:
    """Insert IPA for the first English headword in a long mixed-language line.

    Unlike _insert_missing_ipa (for short column_en cells), this handles
    column_text lines of any length. It only inserts IPA for the FIRST word
    if that word:
    - has no bracket following it already
    - has an IPA entry in the dictionary
    - is not a number/symbol prefix like "».55"

    Returns the text with [ipa] inserted after the first word, or unchanged.
    """
    if not IPA_AVAILABLE:
        return text
    if not text or not text.strip():
        return text

    words = text.strip().split()
    if not words:
        return text

    # Check if text already starts with a bracket (IPA already present)
    if len(words) > 1 and words[1].startswith(('[', '{', '(')):
        return text

    # Try the first few words (skip numeric prefixes like "».55", "0.56")
    for i in range(min(3, len(words))):
        w = words[i]
        clean = re.sub(r'[^a-zA-ZäöüÄÖÜß\'-]', '', w)
        if not clean or len(clean) < 2:
            continue
        if clean.lower() in _GRAMMAR_BRACKET_WORDS:
            continue
        ipa = _lookup_ipa(clean, pronunciation)
        if ipa:
            words[i] = f"{w} [{ipa}]"
            return ' '.join(words)
        # Stop at first real word even if no IPA found
        break

    return text
318
klausur-service/backend/cv_ocr_vocab_postprocess.py
Normal file
@@ -0,0 +1,318 @@
"""
Vocab postprocessing: deterministic quality fixes for OCR-extracted vocabulary.

- Character confusion fix (I/1/l/|)
- Comma-separated word form splitting
- Example sentence attachment to matching vocab entries

Split from cv_ocr_engines.py for maintainability.
"""

import re
from typing import Any, Dict, List


# =============================================================================
# Post-Processing: Deterministic Quality Fixes
# =============================================================================

# --- A. Character Confusion Fix (I/1/l) ---

# Common OCR confusion pairs in vocabulary context
_CHAR_CONFUSION_RULES = [
    # "1" at word start followed by lowercase → likely "I" or "l"
    # Exception: NOT before "." or "," (numbered list prefix: "1. Kreuz", "1, 2, 3")
    (re.compile(r'\b1([a-z])'), r'I\1'),  # 1ch → Ich, 1want → Iwant
    # Standalone "1" → "I" (English pronoun), but NOT "1." or "1," (list number)
    (re.compile(r'(?<!\d)\b1\b(?![\d.,])'), 'I'),  # "1 want" → "I want"
    # "|" → "I", but NOT when embedded between letters (syllable divider: Ka|me|rad)
    # and NOT "|." or "|," (those are "1." list prefixes → spell-checker handles them)
    (re.compile(r'(?<![a-zA-ZäöüÄÖÜß])\|(?!\||[.,])'), 'I'),  # |ch → Ich, | want → I want
]

# Cross-language indicators: if DE has these, EN "1" is almost certainly "I"
_DE_INDICATORS_FOR_EN_I = {'ich', 'mich', 'mir', 'mein', 'meine', 'meiner', 'meinem'}
def _fix_character_confusion(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fix common OCR character confusions using context.

    Deterministic rules:
    - "1" at word start → "I" or "l" based on context
    - Cross-reference EN↔DE: if DE contains "ich/mich/mir", EN "1" → "I"
    - "y " artifact at word boundaries → remove (e.g. "y you" → "you")
    """
    for entry in entries:
        en = entry.get('english', '') or ''
        de = entry.get('german', '') or ''
        ex = entry.get('example', '') or ''

        # Apply general rules to all fields
        for pattern, replacement in _CHAR_CONFUSION_RULES:
            en = pattern.sub(replacement, en)
            de = pattern.sub(replacement, de)
            ex = pattern.sub(replacement, ex)

        # Cross-reference: if DE has "ich"/"mich" indicators, fix EN "1" → "I"
        de_lower_words = set(de.lower().replace(',', ' ').split())
        if de_lower_words & _DE_INDICATORS_FOR_EN_I:
            # Any remaining "1" in EN that looks like "I"
            en = re.sub(r'\b1\b(?![\d.,])', 'I', en)

        # Fix "y " artifact before repeated word: "y you" → "you"
        en = re.sub(r'\by\s+([a-z])', r'\1', en)
        ex = re.sub(r'\by\s+([a-z])', r'\1', ex)

        entry['english'] = en.strip()
        entry['german'] = de.strip()
        entry['example'] = ex.strip()

    return entries
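A minimal sketch of the cross-language rule in action (entry values invented):

entries = [{'english': '1 want to go', 'german': 'ich will gehen', 'example': ''}]
_fix_character_confusion(entries)
print(entries[0]['english'])   # 'I want to go' — standalone '1' fixed; DE 'ich' confirms
# '1. Kreuz' would stay untouched: the negative lookahead spares list numbers.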
# --- B. Comma-Separated Word Form Splitting ---

def _is_singular_plural_pair(parts: List[str]) -> bool:
    """Detect if comma-separated parts are singular/plural forms of the same word.

    E.g. "Maus, Mäuse" or "child, children" → True (should NOT be split).
    "break, broke, broken" → False (different verb forms, OK to split).

    Heuristic: exactly 2 parts that share a common prefix of >= 50% of the
    shorter length (this also covers +s/+es/+en plurals), OR the parts
    differ only by an umlaut plural (Apfel/Äpfel, Maus/Mäuse).
    """
    if len(parts) != 2:
        return False

    a, b = parts[0].lower().strip(), parts[1].lower().strip()
    if not a or not b:
        return False

    # Common prefix heuristic: if words share >= 50% of the shorter word,
    # they are likely forms of the same word (house/houses, child/children).
    min_len = min(len(a), len(b))
    common = 0
    for ca, cb in zip(a, b):
        if ca == cb:
            common += 1
        else:
            break
    if common >= max(2, min_len * 0.5):
        return True

    # Umlaut relation: one form adds umlaut (a→ä, o→ö, u→ü)
    umlaut_map = str.maketrans('aou', 'äöü')
    if a.translate(umlaut_map) == b or b.translate(umlaut_map) == a:
        return True

    # Umlaut plural with suffix (Maus → Mäuse): fold umlauts back and
    # re-run the prefix check, since the direct translation above only
    # covers pure umlaut plurals like Apfel → Äpfel.
    fold = str.maketrans('äöü', 'aou')
    fa, fb = a.translate(fold), b.translate(fold)
    common_f = 0
    for ca, cb in zip(fa, fb):
        if ca == cb:
            common_f += 1
        else:
            break
    if common_f >= max(2, min(len(fa), len(fb)) * 0.5):
        return True

    return False
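A few hand-picked pairs, illustrating the heuristic:

_is_singular_plural_pair(['child', 'children'])         # True  — shared prefix 'child'
_is_singular_plural_pair(['Apfel', 'Äpfel'])            # True  — pure umlaut plural
_is_singular_plural_pair(['Maus', 'Mäuse'])             # True  — umlaut folded, prefix match
_is_singular_plural_pair(['break', 'broke', 'broken'])  # False — 3 parts, splittable forms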
def _split_comma_entries(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Split entries with comma-separated word forms into individual entries.

    E.g. EN: "break, broke, broken" / DE: "brechen, brach, gebrochen"
    → 3 entries: break/brechen, broke/brach, broken/gebrochen

    Does NOT split singular/plural pairs like "Maus, Mäuse" / "child, children"
    because those are forms of the same vocabulary entry.

    Only splits when both EN and DE have the same number of comma-parts and
    the parts are short (word forms, not sentences); two-part singular/plural
    pairs that belong together are left intact.
    """
    result: List[Dict[str, Any]] = []

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()

        # Split by comma (but not inside brackets or parentheses)
        en_parts = _split_by_comma(en)
        de_parts = _split_by_comma(de)

        # Only split if we have multiple parts and counts match
        should_split = False
        if len(en_parts) > 1 and len(de_parts) > 1 and len(en_parts) == len(de_parts):
            # All parts must be short (word forms, not sentences)
            if all(len(p.split()) <= 3 for p in en_parts) and all(len(p.split()) <= 3 for p in de_parts):
                # Do NOT split singular/plural pairs (2 parts that are
                # forms of the same word)
                if _is_singular_plural_pair(en_parts) or _is_singular_plural_pair(de_parts):
                    should_split = False
                else:
                    should_split = True

        if not should_split:
            result.append(entry)
            continue

        # Split into individual entries
        for k in range(len(en_parts)):
            sub = dict(entry)  # shallow copy
            sub['english'] = en_parts[k].strip()
            sub['german'] = de_parts[k].strip() if k < len(de_parts) else ''
            sub['example'] = ''  # examples get attached later
            sub['split_from_comma'] = True
            result.append(sub)

    # Re-number
    for i, e in enumerate(result):
        e['row_index'] = i

    return result
def _split_by_comma(text: str) -> List[str]:
    """Split text by commas, but not inside brackets [...] or parens (...)."""
    if ',' not in text:
        return [text]

    parts = []
    depth_bracket = 0
    depth_paren = 0
    current = []

    for ch in text:
        if ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket = max(0, depth_bracket - 1)
        elif ch == '(':
            depth_paren += 1
        elif ch == ')':
            depth_paren = max(0, depth_paren - 1)
        elif ch == ',' and depth_bracket == 0 and depth_paren == 0:
            parts.append(''.join(current).strip())
            current = []
            continue
        current.append(ch)

    if current:
        parts.append(''.join(current).strip())

    # Filter empty parts
    return [p for p in parts if p]
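For example:

_split_by_comma('break, broke, broken')          # ['break', 'broke', 'broken']
_split_by_comma('dance [dˈɑːns, dæns], tanzen')  # ['dance [dˈɑːns, dæns]', 'tanzen']
_split_by_comma('Eis (gefroren, kalt)')          # ['Eis (gefroren, kalt)']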
# --- C. Example Sentence Attachment ---

def _find_best_vocab_match(example_text: str, vocab_entries: List[Dict[str, Any]]) -> int:
    """Find the vocab entry whose English word(s) best match the example sentence.

    Returns index into vocab_entries, or -1 if no match found.
    Uses word overlap plus a stem fallback: "a broken arm" matches an
    entry whose English field lists "broken".
    """
    if not vocab_entries or not example_text:
        return -1

    example_lower = example_text.lower()
    example_words = set(re.findall(r'[a-zäöüß]+', example_lower))

    best_idx = -1
    best_score = 0

    for i, entry in enumerate(vocab_entries):
        en = (entry.get('english', '') or '').lower()
        if not en:
            continue

        # Extract vocab words (split on space, comma, newline)
        vocab_words = set(re.findall(r'[a-zäöüß]+', en))

        # Score: how many vocab words appear in the example?
        direct_matches = vocab_words & example_words
        score = len(direct_matches) * 10

        # Stem matching: inflected forms match via a shared 4-char prefix
        # (e.g. example word "breaks" matches vocab word "break")
        if score == 0:
            for vw in vocab_words:
                if len(vw) < 3:
                    continue
                stem = vw[:4] if len(vw) >= 4 else vw[:3]
                for ew in example_words:
                    if len(ew) >= len(stem) and ew[:len(stem)] == stem:
                        score += 5
                        break

        if score > best_score:
            best_score = score
            best_idx = i

    return best_idx if best_score > 0 else -1
def _attach_example_sentences(entries: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach rows with EN text but no DE translation as examples to matching vocab entries.

    Vocabulary worksheets often have:
        Row 1: break, broke, broken / brechen, brach, gebrochen
        Row 2: a broken arm (no DE → example for "broken")
        Row 3: a broken plate (no DE → example for "broken")
        Row 4: egg / Ei (has DE → new vocab entry)

    Rules (deterministic, generic):
    - A row is an "example row" if it has EN text but NO DE text (a
      single-character DE counts as OCR noise, not a translation)
    - Find the best matching vocab entry by checking which entry's English words
      appear in the example sentence (semantic matching via word overlap)
    - Fall back to the nearest preceding entry if no word match found
    - Multiple examples get joined with " | "
    """
    if not entries:
        return entries

    # Separate into vocab entries (have DE) and example candidates (no DE)
    vocab_entries: List[Dict[str, Any]] = []
    examples_for: Dict[int, List[str]] = {}  # vocab_index → list of example texts

    for entry in entries:
        en = (entry.get('english', '') or '').strip()
        de = (entry.get('german', '') or '').strip()
        ex = (entry.get('example', '') or '').strip()

        # Treat single-char DE as OCR noise, not real translation.
        # "Ei" (2 chars) is a valid German word, so the threshold is 1.
        has_de = len(de) > 1
        has_en = bool(en)

        # Heuristic: a row without DE is an "example sentence" only if
        # the EN text looks like a sentence (>= 4 words, or contains
        # typical sentence punctuation). Short EN text (1-3 words) is
        # more likely a vocab entry whose DE was missed by OCR.
        _looks_like_sentence = (
            len(en.split()) >= 4
            or en.rstrip().endswith(('.', '!', '?'))
        )
        is_example_candidate = (
            has_en and not has_de and _looks_like_sentence and vocab_entries
        )

        if is_example_candidate:
            # This is an example sentence — find the best matching vocab entry
            example_text = en

            match_idx = _find_best_vocab_match(en, vocab_entries)
            if match_idx < 0:
                # No word match → fall back to last entry
                match_idx = len(vocab_entries) - 1

            if match_idx not in examples_for:
                examples_for[match_idx] = []
            examples_for[match_idx].append(example_text)
        else:
            vocab_entries.append(entry)

    # Attach examples to their matched vocab entries
    for idx, example_list in examples_for.items():
        if 0 <= idx < len(vocab_entries):
            entry = vocab_entries[idx]
            existing_ex = (entry.get('example', '') or '').strip()
            new_examples = ' | '.join(example_list)
            entry['example'] = f"{existing_ex} | {new_examples}" if existing_ex else new_examples

    # Re-number
    for i, e in enumerate(vocab_entries):
        e['row_index'] = i

    return vocab_entries
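End to end, a worksheet fragment is regrouped roughly like this (rows invented):

rows = [
    {'english': 'break, broke, broken', 'german': 'brechen, brach, gebrochen', 'example': ''},
    {'english': 'He has broken his arm.', 'german': '', 'example': ''},
]
rows = _attach_example_sentences(rows)
# → one vocab entry remains; its 'example' field is 'He has broken his arm.'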
134
klausur-service/backend/cv_ocr_word_assembly.py
Normal file
@@ -0,0 +1,134 @@
"""
Word assembly helpers for OCR output.

Groups raw OCR word dicts (with 'top', 'left', 'width', 'text' keys)
into visual lines, rejoins hyphenated words, and produces reading-order
text. All functions are pure standard-library; no NumPy or project
imports required.
"""

import logging
from typing import Dict, List

logger = logging.getLogger(__name__)


def _group_words_into_lines(words: List[Dict], y_tolerance_px: int = 20) -> List[List[Dict]]:
    """Group words by Y position into lines, sorted by X within each line."""
    if not words:
        return []

    sorted_words = sorted(words, key=lambda w: (w['top'], w['left']))
    lines: List[List[Dict]] = []
    current_line: List[Dict] = [sorted_words[0]]
    current_y = sorted_words[0]['top']

    for word in sorted_words[1:]:
        if abs(word['top'] - current_y) <= y_tolerance_px:
            current_line.append(word)
        else:
            current_line.sort(key=lambda w: w['left'])
            lines.append(current_line)
            current_line = [word]
            current_y = word['top']

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        lines.append(current_line)

    return lines
def _words_to_reading_order_lines(words: List[Dict], y_tolerance_px: int = 15) -> List[str]:
    """Group OCR words into visual lines in reading order.

    Returns a list of line strings (one per visual line in the cell).
    """
    if not words:
        return []

    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    return [' '.join(w['text'] for w in line) for line in lines]
def _rejoin_hyphenated(lines: List[str]) -> List[str]:
    """Rejoin words split by line-break hyphenation.

    E.g. ['Fuß-', 'boden'] → ['Fußboden']
         ['some-', 'thing here'] → ['something here']
    """
    if len(lines) <= 1:
        return lines

    result = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # If line ends with '-' and there's a next line, rejoin
        if i + 1 < len(lines) and line.rstrip().endswith('-'):
            stripped = line.rstrip()
            # Get the word fragment before the hyphen (last word)
            prefix = stripped[:-1]  # remove trailing hyphen
            next_line = lines[i + 1]
            # Join: last word of this line + first word of next line
            prefix_words = prefix.rsplit(' ', 1)
            next_words = next_line.split(' ', 1)
            if len(prefix_words) > 1:
                joined = prefix_words[0] + ' ' + prefix_words[1] + next_words[0]
            else:
                joined = prefix_words[0] + next_words[0]
            remainder = next_words[1] if len(next_words) > 1 else ''
            if remainder:
                result.append(joined + ' ' + remainder)
            else:
                result.append(joined)
            i += 2
        else:
            result.append(line)
            i += 1
    return result
def _words_to_reading_order_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words into text in correct reading order, preserving line breaks.

    Groups words into visual lines by Y-tolerance, sorts each line by X,
    rejoins hyphenated words, then joins lines with newlines.
    """
    lines = _words_to_reading_order_lines(words, y_tolerance_px)
    lines = _rejoin_hyphenated(lines)
    return '\n'.join(lines)


def _words_to_spaced_text(words: List[Dict], y_tolerance_px: int = 15) -> str:
    """Join OCR words preserving proportional horizontal spacing.

    Instead of single spaces between words, inserts multiple spaces based on
    the pixel gap between words relative to the average character width.
    Useful for box sub-sections where spatial layout matters.
    """
    lines = _group_words_into_lines(words, y_tolerance_px=y_tolerance_px)
    result_lines = []

    for line_words in lines:
        if not line_words:
            continue
        sorted_words = sorted(line_words, key=lambda w: w['left'])

        # Calculate average character width from all words in the line
        total_chars = sum(len(w['text']) for w in sorted_words if w.get('text'))
        total_width = sum(w['width'] for w in sorted_words if w.get('text'))
        avg_char_width = total_width / total_chars if total_chars > 0 else 10

        parts = []
        for i, word in enumerate(sorted_words):
            parts.append(word.get('text', ''))
            if i < len(sorted_words) - 1:
                next_word = sorted_words[i + 1]
                gap_px = next_word['left'] - (word['left'] + word['width'])
                num_spaces = max(1, round(gap_px / avg_char_width))
                parts.append(' ' * num_spaces)

        result_lines.append(''.join(parts))

    return '\n'.join(result_lines)
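A tiny worked example of the proportional spacing (pixel values hypothetical):

words = [
    {'text': 'name:', 'top': 10, 'left': 0,   'width': 50},
    {'text': 'Anna',  'top': 12, 'left': 150, 'width': 40},
]
_words_to_spaced_text(words)
# avg char width = (50 + 40) / 9 = 10px; gap = 100px → 10 spaces:
# 'name:          Anna'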
140
klausur-service/backend/dsfa_chunking.py
Normal file
@@ -0,0 +1,140 @@
"""
DSFA Chunking — Text chunking strategies for document ingestion.

Contains:
- chunk_text_recursive: Recursive chunking with overlap
- chunk_by_sections: Section-marker-based chunking
- chunk_by_list_items: List-item-based chunking
- chunk_document: Strategy router
"""

import re
from typing import List, Dict

from dsfa_sources_registry import DSFA_CHUNK_CONFIG


def chunk_text_recursive(text: str, max_size: int = 1000, overlap: int = 200) -> List[Dict]:
    """Recursively chunk text with overlap."""
    chunks = []
    start = 0

    while start < len(text):
        end = min(start + max_size, len(text))

        # Find a good break point (sentence end, paragraph)
        if end < len(text):
            for sep in ["\n\n", "\n", ". ", ", ", " "]:
                last_sep = text[start:end].rfind(sep)
                if last_sep > max_size // 2:
                    end = start + last_sep + len(sep)
                    break

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunks.append({
                "content": chunk_text,
                "start_char": start,
                "end_char": end
            })

        start = end - overlap if end < len(text) else len(text)

    return chunks
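A quick sketch of the overlap behaviour (text invented):

text = 'Erste Aussage. Zweite Aussage. ' * 40          # roughly 1240 chars
chunks = chunk_text_recursive(text, max_size=500, overlap=100)
# Each chunk breaks at a sentence boundary past the 250-char midpoint,
# and consecutive chunks share ~100 chars for retrieval context.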
def chunk_by_sections(text: str, markers: List[str], max_size: int = 1500, overlap: int = 200) -> List[Dict]:
    """Chunk text by section markers."""
    chunks = []
    pattern = "|".join(f"({m})" for m in markers)

    matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))

    if not matches:
        return chunk_text_recursive(text, max_size, overlap)

    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        section_text = text[start:end].strip()
        section_title = match.group(0).strip()

        if len(section_text) > max_size:
            sub_chunks = chunk_text_recursive(section_text, max_size, overlap)
            for j, sub in enumerate(sub_chunks):
                chunks.append({
                    "content": sub["content"],
                    "section_title": section_title if j == 0 else f"{section_title} (cont.)",
                    "start_char": start + sub["start_char"],
                    "end_char": start + sub["end_char"]
                })
        else:
            chunks.append({
                "content": section_text,
                "section_title": section_title,
                "start_char": start,
                "end_char": end
            })

    return chunks
def chunk_by_list_items(text: str, markers: List[str], max_size: int = 800) -> List[Dict]:
    """Chunk text by list item markers."""
    chunks = []
    pattern = "|".join(f"({m})" for m in markers)

    lines = text.split("\n")
    current_item = ""
    current_start = 0

    for i, line in enumerate(lines):
        if re.match(pattern, line.strip()):
            if current_item.strip():
                chunks.append({
                    "content": current_item.strip(),
                    "start_char": current_start,
                    "end_char": current_start + len(current_item)
                })
            current_item = line
            current_start = sum(len(lines[j]) + 1 for j in range(i))
        else:
            current_item += "\n" + line

    if current_item.strip():
        chunks.append({
            "content": current_item.strip(),
            "start_char": current_start,
            "end_char": current_start + len(current_item)
        })

    return chunks
def chunk_document(text: str, source_code: str) -> List[Dict]:
    """Chunk document using the appropriate strategy for the source type."""
    config = DSFA_CHUNK_CONFIG.get(source_code, DSFA_CHUNK_CONFIG["DEFAULT"])

    if source_code.endswith("_MUSS_PUBLIC") or source_code.endswith("_MUSS_PRIVATE"):
        config = DSFA_CHUNK_CONFIG["MUSS_LISTEN"]

    if config["strategy"] == "section_based":
        return chunk_by_sections(
            text,
            config["section_markers"],
            config["max_chunk_size"],
            config["overlap"]
        )
    elif config["strategy"] == "list_item":
        return chunk_by_list_items(
            text,
            config["list_markers"],
            config["max_chunk_size"]
        )
    else:
        return chunk_text_recursive(
            text,
            config["max_chunk_size"],
            config["overlap"]
        )
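Usage is a one-liner; the source codes and their per-source strategies live in DSFA_CHUNK_CONFIG in dsfa_sources_registry (the codes below are hypothetical examples, not part of this diff):

document_text = '...'                                      # placeholder input
chunks = chunk_document(document_text, 'EXAMPLE_SOURCE')   # hypothetical code → its config, else DEFAULT
muss = chunk_document(document_text, 'X_MUSS_PUBLIC')      # any *_MUSS_PUBLIC/_PRIVATE → MUSS_LISTEN config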
File diff suppressed because it is too large
239
klausur-service/backend/dsfa_corpus_store.py
Normal file
@@ -0,0 +1,239 @@
"""
DSFA Corpus Store — Database operations and data classes.

Contains:
- DSFAChunkPayload: Qdrant point payload schema
- DSFASearchResult: Search result with attribution
- DSFACorpusStore: PostgreSQL operations for DSFA corpus
"""

import hashlib
import uuid
from typing import List, Dict, Optional
from dataclasses import dataclass

import asyncpg

from dsfa_sources_registry import LICENSE_REGISTRY


@dataclass
class DSFAChunkPayload:
    """Payload schema for Qdrant points."""
    chunk_id: str
    document_id: str
    source_id: str
    content: str
    section_title: Optional[str] = None
    source_code: str = ""
    source_name: str = ""
    attribution_text: str = ""
    license_code: str = ""
    attribution_required: bool = True
    document_type: str = ""
    category: str = ""
    language: str = "de"
    page_number: Optional[int] = None


@dataclass
class DSFASearchResult:
    """Search result with attribution."""
    chunk_id: str
    content: str
    score: float
    source_code: str
    source_name: str
    attribution_text: str
    license_code: str
    license_url: Optional[str]
    attribution_required: bool
    source_url: Optional[str]
    document_type: str
    category: str
    section_title: Optional[str]
    page_number: Optional[int]
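A minimal sketch of how the payload class might feed a Qdrant point; the field values and the upsert wiring are assumptions, not part of this diff:

import dataclasses

payload = DSFAChunkPayload(
    chunk_id='...',                     # typically a UUID string
    document_id='...',
    source_id='...',
    content='Art. 35 DSGVO verlangt eine Datenschutz-Folgenabschätzung ...',
    source_code='EXAMPLE_SOURCE',       # hypothetical
    attribution_text='© Example Org, CC BY 4.0',
    license_code='CC-BY-4.0',
)
point_payload = dataclasses.asdict(payload)   # dict ready to attach to a Qdrant point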
class DSFACorpusStore:
    """Database operations for DSFA corpus."""

    def __init__(self, pool: asyncpg.Pool):
        self.pool = pool

    async def register_source(self, source_data: Dict) -> str:
        """Register a DSFA source in the database."""
        async with self.pool.acquire() as conn:
            existing = await conn.fetchval(
                "SELECT id FROM dsfa_sources WHERE source_code = $1",
                source_data["source_code"]
            )
            if existing:
                await conn.execute("""
                    UPDATE dsfa_sources SET
                        name = $2,
                        full_name = $3,
                        organization = $4,
                        source_url = $5,
                        eur_lex_celex = $6,
                        license_code = $7,
                        license_url = $8,
                        attribution_required = $9,
                        attribution_text = $10,
                        document_type = $11,
                        language = $12,
                        updated_at = NOW()
                    WHERE source_code = $1
                """,
                    source_data["source_code"],
                    source_data["name"],
                    source_data.get("full_name"),
                    source_data.get("organization"),
                    source_data.get("source_url"),
                    source_data.get("eur_lex_celex"),
                    source_data["license_code"],
                    source_data.get("license_url"),
                    LICENSE_REGISTRY.get(source_data["license_code"], {}).get("attribution_required", True),
                    source_data["attribution_text"],
                    source_data.get("document_type"),
                    source_data.get("language", "de")
                )
                return str(existing)
            else:
                source_id = await conn.fetchval("""
                    INSERT INTO dsfa_sources (
                        source_code, name, full_name, organization, source_url,
                        eur_lex_celex, license_code, license_url, attribution_required,
                        attribution_text, document_type, language
                    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                    RETURNING id
                """,
                    source_data["source_code"],
                    source_data["name"],
                    source_data.get("full_name"),
                    source_data.get("organization"),
                    source_data.get("source_url"),
                    source_data.get("eur_lex_celex"),
                    source_data["license_code"],
                    source_data.get("license_url"),
                    LICENSE_REGISTRY.get(source_data["license_code"], {}).get("attribution_required", True),
                    source_data["attribution_text"],
                    source_data.get("document_type"),
                    source_data.get("language", "de")
                )
                return str(source_id)

    async def get_source_by_code(self, source_code: str) -> Optional[Dict]:
        """Get a source by its code."""
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow(
                "SELECT * FROM dsfa_sources WHERE source_code = $1",
                source_code
            )
            if row:
                return dict(row)
            return None

    async def list_sources(self) -> List[Dict]:
        """List all registered sources."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT * FROM dsfa_sources ORDER BY source_code"
            )
            return [dict(row) for row in rows]

    async def create_document(
        self,
        source_id: str,
        title: str,
        file_name: Optional[str] = None,
        file_type: Optional[str] = None,
        minio_path: Optional[str] = None,
        original_url: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Create a document record."""
        import json
        metadata_json = json.dumps(metadata or {})
        async with self.pool.acquire() as conn:
            doc_id = await conn.fetchval("""
                INSERT INTO dsfa_documents (
                    source_id, title, file_name, file_type, minio_path,
                    original_url, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb)
                RETURNING id
            """,
                uuid.UUID(source_id),
                title,
                file_name,
                file_type,
                minio_path,
                original_url,
                metadata_json
            )
            return str(doc_id)

    async def create_chunk(
        self,
        document_id: str,
        source_id: str,
        content: str,
        chunk_index: int,
        section_title: Optional[str] = None,
        page_number: Optional[int] = None,
        category: Optional[str] = None,
        qdrant_point_id: Optional[str] = None,
        metadata: Optional[Dict] = None
    ) -> str:
        """Create a chunk record."""
        import json
        content_hash = hashlib.sha256(content.encode()).hexdigest()

        async with self.pool.acquire() as conn:
            chunk_id = await conn.fetchval("""
                INSERT INTO dsfa_document_chunks (
                    document_id, source_id, content, content_hash, chunk_index,
                    section_title, page_number, category, qdrant_point_id, metadata
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::jsonb)
                RETURNING id
            """,
                uuid.UUID(document_id),
                uuid.UUID(source_id),
                content,
                content_hash,
                chunk_index,
                section_title,
                page_number,
                category,
                qdrant_point_id,
                json.dumps(metadata or {})
            )
            return str(chunk_id)

    async def get_chunk_with_attribution(self, chunk_id: str) -> Optional[Dict]:
        """Get a chunk with full source attribution."""
        async with self.pool.acquire() as conn:
            row = await conn.fetchrow("""
                SELECT * FROM dsfa_chunk_with_attribution
                WHERE chunk_id = $1
            """, uuid.UUID(chunk_id))
            if row:
                return dict(row)
            return None

    async def get_source_stats(self) -> List[Dict]:
        """Get aggregated stats per source."""
        async with self.pool.acquire() as conn:
            rows = await conn.fetch("SELECT * FROM dsfa_source_stats")
            return [dict(row) for row in rows]

    async def update_document_indexed(self, document_id: str, chunks_count: int):
|
||||
"""Update document with indexing information."""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE dsfa_documents
|
||||
SET chunks_generated = $2,
|
||||
last_indexed_at = NOW(),
|
||||
text_extracted = true
|
||||
WHERE id = $1
|
||||
""", uuid.UUID(document_id), chunks_count)
|
||||
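A minimal usage sketch for DSFACorpusStore (not part of the commit; the DSN and all source fields below are made-up placeholders, and the license code is assumed to be a key in LICENSE_REGISTRY):

import asyncio

import asyncpg

from dsfa_corpus_store import DSFACorpusStore


async def demo() -> None:
    # Hypothetical DSN; the real service wires the pool elsewhere.
    pool = await asyncpg.create_pool("postgresql://klausur@localhost/klausur")
    store = DSFACorpusStore(pool)
    source_id = await store.register_source({
        "source_code": "demo_src",
        "name": "Demo Source",
        "license_code": "CC-BY-4.0",  # assumed entry in LICENSE_REGISTRY
        "attribution_text": "Demo Source (CC BY 4.0)",
    })
    doc_id = await store.create_document(source_id, title="Demo-Dokument")
    await store.create_chunk(doc_id, source_id, "Beispieltext", chunk_index=0)
    await store.update_document_indexed(doc_id, chunks_count=1)
    await pool.close()


asyncio.run(demo())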
157
klausur-service/backend/dsfa_qdrant_service.py
Normal file
@@ -0,0 +1,157 @@
"""
DSFA Qdrant Service — Vector store operations.

Contains:
- DSFAQdrantService: Qdrant client wrapper for DSFA corpus
"""

import os
import uuid
from typing import List, Dict, Optional
from dataclasses import asdict

from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchAny
)

from dsfa_corpus_store import DSFAChunkPayload

QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
DSFA_COLLECTION = "bp_dsfa_corpus"
VECTOR_SIZE = 1024  # BGE-M3


class DSFAQdrantService:
    """Qdrant operations for DSFA corpus."""

    def __init__(self, url: Optional[str] = None):
        self.url = url or QDRANT_URL
        self._client = None

    @property
    def client(self) -> QdrantClient:
        if self._client is None:
            self._client = QdrantClient(url=self.url, check_compatibility=False)
        return self._client

    async def ensure_collection(self) -> bool:
        """Ensure DSFA collection exists."""
        try:
            collections = self.client.get_collections().collections
            collection_names = [c.name for c in collections]

            if DSFA_COLLECTION not in collection_names:
                self.client.create_collection(
                    collection_name=DSFA_COLLECTION,
                    vectors_config=VectorParams(
                        size=VECTOR_SIZE,
                        distance=Distance.COSINE
                    )
                )
                print(f"Created collection: {DSFA_COLLECTION}")
            return True
        except Exception as e:
            print(f"Error ensuring collection: {e}")
            return False

    async def index_chunks(
        self,
        chunks: List[Dict],
        embeddings: List[List[float]]
    ) -> int:
        """Index chunks into Qdrant."""
        if not chunks or not embeddings:
            return 0

        points = []
        for chunk, embedding in zip(chunks, embeddings):
            point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk["chunk_id"]))

            payload = DSFAChunkPayload(
                chunk_id=chunk["chunk_id"],
                document_id=chunk["document_id"],
                source_id=chunk["source_id"],
                content=chunk["content"],
                section_title=chunk.get("section_title"),
                source_code=chunk["source_code"],
                source_name=chunk["source_name"],
                attribution_text=chunk["attribution_text"],
                license_code=chunk["license_code"],
                attribution_required=chunk.get("attribution_required", True),
                document_type=chunk.get("document_type", ""),
                category=chunk.get("category", ""),
                language=chunk.get("language", "de"),
                page_number=chunk.get("page_number")
            )

            points.append(
                PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload=asdict(payload)
                )
            )

        self.client.upsert(collection_name=DSFA_COLLECTION, points=points)
        return len(points)

    async def search(
        self,
        query_embedding: List[float],
        source_codes: Optional[List[str]] = None,
        document_types: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[Dict]:
        """Search DSFA corpus with filters."""
        must_conditions = []

        # MatchAny gives OR-semantics within each value list; the
        # conditions across different fields still combine with AND.
        if source_codes:
            must_conditions.append(
                FieldCondition(key="source_code", match=MatchAny(any=source_codes))
            )

        if document_types:
            must_conditions.append(
                FieldCondition(key="document_type", match=MatchAny(any=document_types))
            )

        if categories:
            must_conditions.append(
                FieldCondition(key="category", match=MatchAny(any=categories))
            )

        query_filter = Filter(must=must_conditions) if must_conditions else None

        results = self.client.query_points(
            collection_name=DSFA_COLLECTION,
            query=query_embedding,
            query_filter=query_filter,
            limit=limit
        )

        return [
            {
                "id": str(r.id),
                "score": r.score,
                **r.payload
            }
            for r in results.points
        ]

    async def get_stats(self) -> Dict:
        """Get collection statistics."""
        try:
            info = self.client.get_collection(DSFA_COLLECTION)
            return {
                "collection": DSFA_COLLECTION,
                "vectors_count": info.vectors_count,
                "points_count": info.points_count,
                "status": info.status.value
            }
        except Exception as e:
            return {"error": str(e), "collection": DSFA_COLLECTION}
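The index/search round trip, sketched with a stand-in vector (the real BGE-M3 embeddings come from a separate service; every field value below is invented):

import asyncio

from dsfa_qdrant_service import DSFAQdrantService


async def demo() -> None:
    svc = DSFAQdrantService()  # defaults to QDRANT_URL
    await svc.ensure_collection()
    vec = [0.0] * 1024  # placeholder for a real 1024-dim BGE-M3 embedding
    chunk = {
        "chunk_id": "c1", "document_id": "d1", "source_id": "s1",
        "content": "Beispieltext", "source_code": "demo_src",
        "source_name": "Demo Source",
        "attribution_text": "Demo Source (CC BY 4.0)",
        "license_code": "CC-BY-4.0",
    }
    await svc.index_chunks([chunk], [vec])
    hits = await svc.search(vec, source_codes=["demo_src"], limit=5)
    print([(h["source_code"], round(h["score"], 3)) for h in hits])


asyncio.run(demo())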
1140
klausur-service/backend/dsfa_sources_registry.py
Normal file
File diff suppressed because it is too large
Load Diff
305
klausur-service/backend/grid_build_cell_ops.py
Normal file
@@ -0,0 +1,305 @@
"""
Grid Build Cell Ops — Cell-level operations: bullet/artifact removal,
garbled cell cleanup, word-box reordering, and max_columns enforcement.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Tuple

from cv_ocr_engines import (
    _words_to_reading_order_text, _group_words_into_lines, _lookup_ipa,
)

logger = logging.getLogger(__name__)


def _remove_bullets_and_artifacts(zones_data: List[Dict[str, Any]]) -> None:
    """Remove blue bullet/artifact word_boxes (Step 5i).

    Handles tiny coloured symbols, overlapping word_boxes, duplicate text,
    and syllable-split word merging.
    """
    _ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
    _REMOVE_SYMBOLS = {'>', '<', '~', '\\', '^', '`', '#', '|', '¬', '¦'}

    bullet_removed = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            to_remove: set = set()

            # Rule (a): tiny coloured symbols
            for i, wb in enumerate(wbs):
                cn = wb.get("color_name", "black")
                if (cn != "black"
                        and wb.get("width", 0) * wb.get("height", 0) < 200
                        and wb.get("conf", 100) < 85):
                    to_remove.add(i)

            # Rule (a2): isolated non-alphanumeric symbols
            for i, wb in enumerate(wbs):
                t = (wb.get("text") or "").strip()
                if t and not re.search(r'[a-zA-Z0-9äöüÄÖÜß]', t) and len(t) <= 2:
                    if t in _REMOVE_SYMBOLS:
                        to_remove.add(i)

            # Rule (b) + (c): overlap and duplicate detection
            to_merge: List[Tuple[int, int]] = []
            indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
            for p in range(len(indexed) - 1):
                i1, w1 = indexed[p]
                i2, w2 = indexed[p + 1]
                x1s, x1e = w1.get("left", 0), w1.get("left", 0) + w1.get("width", 0)
                x2s, x2e = w2.get("left", 0), w2.get("left", 0) + w2.get("width", 0)
                overlap = max(0, min(x1e, x2e) - max(x1s, x2s))
                min_w = min(w1.get("width", 1), w2.get("width", 1))
                gap = x2s - x1e
                overlap_pct = overlap / min_w if min_w > 0 else 0

                if overlap_pct > 0.20:
                    t1 = (w1.get("text") or "").strip()
                    t2 = (w2.get("text") or "").strip()

                    # Syllable-split words
                    if (overlap_pct <= 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)):
                        to_merge.append((i1, i2))
                        continue

                    # High overlap with short prefix
                    if (overlap_pct > 0.75
                            and _ALPHA_WORD_RE.match(t1)
                            and _ALPHA_WORD_RE.match(t2)
                            and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
                            and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
                        to_merge.append((i1, i2))
                        continue

                    if overlap_pct <= 0.40:
                        continue

                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)

                    # Very high overlap: prefer IPA-dictionary word
                    if overlap_pct > 0.90 and t1.lower() != t2.lower():
                        in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
                        in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
                        if in_dict_1 and not in_dict_2:
                            to_remove.add(i2)
                            continue
                        elif in_dict_2 and not in_dict_1:
                            to_remove.add(i1)
                            continue

                    if c1 < c2:
                        to_remove.add(i1)
                    elif c2 < c1:
                        to_remove.add(i2)
                    else:
                        if w1.get("height", 0) > w2.get("height", 0):
                            to_remove.add(i1)
                        else:
                            to_remove.add(i2)

                elif (gap < 6
                        and w1.get("color_name") == "blue"
                        and w2.get("color_name") == "blue"
                        and (w1.get("text") or "").strip() == (w2.get("text") or "").strip()):
                    c1 = w1.get("conf", 50)
                    c2 = w2.get("conf", 50)
                    to_remove.add(i1 if c1 <= c2 else i2)

            # Execute merges first (syllable-split words)
            if to_merge:
                merge_parent: Dict[int, int] = {}
                for mi1, mi2 in to_merge:
                    actual_mi1 = mi1
                    while actual_mi1 in merge_parent:
                        actual_mi1 = merge_parent[actual_mi1]
                    if actual_mi1 in to_remove or mi2 in to_remove:
                        continue
                    if mi2 in merge_parent:
                        continue
                    mw1, mw2 = wbs[actual_mi1], wbs[mi2]
                    mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
                    mt2 = (mw2.get("text") or "").strip()
                    merged_text = mt1 + mt2
                    mx = min(mw1["left"], mw2["left"])
                    my = min(mw1["top"], mw2["top"])
                    mr = max(mw1["left"] + mw1["width"],
                             mw2["left"] + mw2["width"])
                    mb = max(mw1["top"] + mw1["height"],
                             mw2["top"] + mw2["height"])
                    mw1["text"] = merged_text
                    mw1["left"] = mx
                    mw1["top"] = my
                    mw1["width"] = mr - mx
                    mw1["height"] = mb - my
                    mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
                    to_remove.add(mi2)
                    merge_parent[mi2] = actual_mi1
                    bullet_removed -= 1

            if to_remove:
                bullet_removed += len(to_remove)
                filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
                cell["word_boxes"] = filtered
                if not cell.get("_ipa_corrected"):
                    cell["text"] = _words_to_reading_order_text(filtered)

    if bullet_removed:
        for z in zones_data:
            z["cells"] = [c for c in z.get("cells", [])
                          if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info("Step 5i: removed %d bullet/artifact word_boxes", bullet_removed)


def _remove_garbled_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove cells whose text is entirely garbled / artifact noise (Step 5j-pre)."""
    _COMMON_SHORT_WORDS = {
        "ab", "am", "an", "da", "du", "er", "es", "im", "in", "ja",
        "ob", "so", "um", "zu", "wo", "je", "oh", "or",
        "die", "der", "das", "dem", "den", "des", "ein", "und",
        "auf", "aus", "bei", "bis", "für", "mit", "nur", "von",
        "a", "i", "an", "as", "at", "be", "by", "do", "go", "he",
        "if", "in", "is", "it", "me", "my", "no", "of", "oh", "ok",
        "on", "or", "so", "to", "up", "us", "we",
        "the", "and", "but", "for", "not",
    }
    _PURE_JUNK_RE = re.compile(r'^[\W\d\s]+$')
    artifact_cells_removed = 0

    for z in zones_data:
        before = len(z.get("cells", []))
        kept = []
        for cell in z.get("cells", []):
            text = (cell.get("text") or "").strip()
            core = text.rstrip(".,;:!?'\"")
            is_artifact = False
            if not core:
                is_artifact = True
            elif _PURE_JUNK_RE.match(core):
                if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
                    is_artifact = True
            elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
                is_artifact = True
            elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
                is_artifact = True
            elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
                  and not re.match(r'^[pPsS]\.?\d+$', core)):
                is_artifact = True
            if is_artifact:
                kept.append(None)
            else:
                kept.append(cell)
        z["cells"] = [c for c in kept if c is not None]
        artifact_cells_removed += before - len(z["cells"])

    if artifact_cells_removed:
        for z in zones_data:
            cell_ris = {c.get("row_index") for c in z.get("cells", [])}
            z["rows"] = [r for r in z.get("rows", []) if r["index"] in cell_ris]
        logger.info("Step 5j-pre: removed %d artifact cells", artifact_cells_removed)


def _normalize_word_order(zones_data: List[Dict[str, Any]]) -> None:
    """Normalise word_box order to reading order (Step 5j)."""
    wb_reordered = 0
    for z in zones_data:
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            if len(wbs) < 2:
                continue
            lines = _group_words_into_lines(wbs, y_tolerance_px=15)
            sorted_wbs = [w for line in lines for w in line]
            if [id(w) for w in sorted_wbs] != [id(w) for w in wbs]:
                cell["word_boxes"] = sorted_wbs
                wb_reordered += 1
    if wb_reordered:
        logger.info("Step 5j: re-ordered word_boxes in %d cells to reading order", wb_reordered)


def _enforce_max_columns(
    zones_data: List[Dict[str, Any]],
    max_columns: int,
) -> None:
    """Enforce max_columns by merging narrowest columns (Step 5k)."""
    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if len(cols) <= max_columns:
            continue

        logger.info(
            "max_columns=%d: zone %s has %d columns -> merging",
            max_columns, z.get("zone_index"), len(cols),
        )

        cols_by_width = sorted(cols, key=lambda c: (c.get("x_max_px", 0) - c.get("x_min_px", 0)))

        while len(cols) > max_columns:
            narrowest = cols_by_width.pop(0)
            ni = narrowest["index"]

            sorted_by_x = sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))
            pos = next(i for i, c in enumerate(sorted_by_x) if c["index"] == ni)
            if pos + 1 < len(sorted_by_x):
                merge_target = sorted_by_x[pos + 1]
            elif pos > 0:
                merge_target = sorted_by_x[pos - 1]
            else:
                break

            ti = merge_target["index"]

            merge_target["x_min_px"] = min(
                merge_target.get("x_min_px", merge_target.get("x_min_pct", 0)),
                narrowest.get("x_min_px", narrowest.get("x_min_pct", 0)),
            )
            merge_target["x_max_px"] = max(
                merge_target.get("x_max_px", merge_target.get("x_max_pct", 100)),
                narrowest.get("x_max_px", narrowest.get("x_max_pct", 100)),
            )
            if "x_min_pct" in merge_target and "x_min_pct" in narrowest:
                merge_target["x_min_pct"] = min(merge_target["x_min_pct"], narrowest["x_min_pct"])
                merge_target["x_max_pct"] = max(merge_target["x_max_pct"], narrowest["x_max_pct"])

            for cell in cells:
                if cell.get("col_index") == ni:
                    cell["col_index"] = ti
                    existing = next(
                        (c for c in cells if c["col_index"] == ti
                         and c["row_index"] == cell["row_index"]
                         and c is not cell),
                        None,
                    )
                    if existing:
                        existing["text"] = (
                            (existing.get("text", "") + " " + cell.get("text", "")).strip()
                        )
                        existing["word_boxes"] = existing.get("word_boxes", []) + cell.get("word_boxes", [])
                        cell["_merged"] = True

            z["cells"] = [c for c in cells if not c.get("_merged")]
            cells = z["cells"]
            cols.remove(narrowest)
            cols_by_width = [c for c in cols_by_width if c["index"] != ni]

        # Re-index columns 0..N-1
        for new_idx, col in enumerate(sorted(cols, key=lambda c: c.get("x_min_px", c.get("x_min_pct", 0)))):
            old_idx = col["index"]
            col["index"] = new_idx
            for cell in cells:
                if cell.get("col_index") == old_idx:
                    cell["col_index"] = new_idx

        logger.info("max_columns: zone %s now has %d columns", z.get("zone_index"), len(cols))
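A toy case for the overlap rule in _remove_bullets_and_artifacts (geometry invented, behaviour as sketched under the assumption that _words_to_reading_order_text joins the surviving boxes left to right): two boxes covering nearly the same span with identical text collapse to the higher-confidence one, and the cell text is rebuilt.

from grid_build_cell_ops import _remove_bullets_and_artifacts

zones = [{
    "cells": [{
        "text": "Haus Haus",
        "word_boxes": [
            {"text": "Haus", "left": 10, "top": 5, "width": 60, "height": 20,
             "conf": 91, "color_name": "black"},
            {"text": "Haus", "left": 14, "top": 5, "width": 60, "height": 20,
             "conf": 40, "color_name": "black"},
        ],
    }],
}]
_remove_bullets_and_artifacts(zones)
# Expected: the conf=40 duplicate is gone, leaving a single 'Haus' box.
print([wb["text"] for wb in zones[0]["cells"][0]["word_boxes"]])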
390
klausur-service/backend/grid_build_cleanup.py
Normal file
@@ -0,0 +1,390 @@
"""
Grid Build Cleanup — Phase 3: Junk row removal, artifact cleanup, pipe
divider removal, connector normalization, border strip detection, and
alphabet sidebar removal.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List

from cv_ocr_engines import _words_to_reading_order_text

logger = logging.getLogger(__name__)

_PIPE_RE = re.compile(r"^\|+$")


def _cleanup_zones(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
    session_id: str,
) -> bool:
    """Clean up zone data: remove junk rows, artifacts, pipes, border strips.

    Args:
        zones_data: List of zone dicts (modified in place).
        border_prefiltered: Whether border words were already pre-filtered.
        session_id: For logging.

    Returns:
        Updated border_prefiltered flag.
    """
    _remove_junk_rows(zones_data)
    _remove_artifact_cells(zones_data)
    _remove_oversized_word_boxes(zones_data)
    _remove_pipe_dividers(zones_data)
    _normalize_connector_columns(zones_data)
    border_prefiltered = _remove_border_strips(zones_data, border_prefiltered)
    _remove_alphabet_sidebars(zones_data)
    return border_prefiltered


def _remove_junk_rows(zones_data: List[Dict[str, Any]]) -> None:
    """Remove rows where ALL cells contain only short, low-confidence text.

    Also removes 'oversized stub' rows and 'scattered debris' rows.
    """
    _JUNK_CONF_THRESHOLD = 50
    _JUNK_MAX_TEXT_LEN = 3

    for z in zones_data:
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not cells or not rows:
            continue

        # Compute median word height across the zone for oversized detection
        all_wb_heights = [
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        ]
        median_wb_h = sorted(all_wb_heights)[len(all_wb_heights) // 2] if all_wb_heights else 28

        junk_row_indices = set()
        for row in rows:
            ri = row["index"]
            row_cells = [c for c in cells if c.get("row_index") == ri]
            if not row_cells:
                continue

            row_wbs = [
                wb for cell in row_cells
                for wb in cell.get("word_boxes") or []
            ]
            if not row_wbs:
                # Text-only cells: Rule 3's max() needs a non-empty sequence.
                continue

            # Rule 1: ALL word_boxes are low-conf AND short text
            all_junk = True
            for wb in row_wbs:
                text = (wb.get("text") or "").strip()
                conf = wb.get("conf", 0)
                if conf >= _JUNK_CONF_THRESHOLD or len(text) > _JUNK_MAX_TEXT_LEN:
                    all_junk = False
                    break
            if all_junk:
                junk_row_indices.add(ri)
                continue

            # Rule 2: oversized stub -- <=3 words, short total text,
            # and word height > 1.8x median
            if len(row_wbs) <= 3:
                total_text = "".join((wb.get("text") or "").strip() for wb in row_wbs)
                max_h = max((wb.get("height", 0) for wb in row_wbs), default=0)
                has_page_ref = any(
                    re.match(r'^[pPsS]\.?\s*\d+$', (wb.get("text") or "").strip())
                    for wb in row_wbs
                )
                if len(total_text) <= 5 and max_h > median_wb_h * 1.8 and not has_page_ref:
                    junk_row_indices.add(ri)
                    continue

            # Rule 3: scattered debris -- rows with only tiny fragments
            longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
            if longest <= 2:
                junk_row_indices.add(ri)
                continue

        if junk_row_indices:
            z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
            z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
            logger.info(
                "build-grid: removed %d junk rows from zone %d: %s",
                len(junk_row_indices), z["zone_index"],
                sorted(junk_row_indices),
            )


def _remove_artifact_cells(zones_data: List[Dict[str, Any]]) -> None:
    """Remove individual cells with a single very-short, low-conf word."""
    _ARTIFACT_MAX_LEN = 2
    _ARTIFACT_CONF_THRESHOLD = 65

    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        artifact_ids = set()
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            if len(wbs) != 1:
                continue
            wb = wbs[0]
            text = (wb.get("text") or "").strip()
            conf = wb.get("conf", 100)
            if len(text) <= _ARTIFACT_MAX_LEN and conf < _ARTIFACT_CONF_THRESHOLD:
                artifact_ids.add(cell.get("cell_id"))
        if artifact_ids:
            z["cells"] = [c for c in cells if c.get("cell_id") not in artifact_ids]
            logger.info(
                "build-grid: removed %d artifact cells from zone %d: %s",
                len(artifact_ids), z.get("zone_index", 0),
                [c.get("text") for c in cells if c.get("cell_id") in artifact_ids],
            )


def _remove_oversized_word_boxes(zones_data: List[Dict[str, Any]]) -> None:
    """Remove word_boxes whose height is 3x+ the median (graphic artifacts)."""
    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        all_wh = [
            wb["height"]
            for cell in cells
            for wb in cell.get("word_boxes") or []
            if wb.get("height", 0) > 0
        ]
        if not all_wh:
            continue
        med_h = sorted(all_wh)[len(all_wh) // 2]
        oversized_threshold = med_h * 3
        removed_oversized = 0
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if wb.get("height", 0) <= oversized_threshold]
            if len(filtered) < len(wbs):
                removed_oversized += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_oversized:
            z["cells"] = [c for c in cells if c.get("word_boxes")]
            logger.info(
                "build-grid: removed %d oversized word_boxes (>%dpx) from zone %d",
                removed_oversized, oversized_threshold, z.get("zone_index", 0),
            )


def _remove_pipe_dividers(zones_data: List[Dict[str, Any]]) -> None:
    """Remove pipe-character word_boxes (column divider artifacts)."""
    for z in zones_data:
        if z.get("vsplit_group") is not None:
            continue  # pipes already removed before split
        removed_pipes = 0
        for cell in z.get("cells", []):
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if not _PIPE_RE.match((wb.get("text") or "").strip())]
            if len(filtered) < len(wbs):
                removed_pipes += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        if removed_pipes:
            z["cells"] = [c for c in z.get("cells", []) if (c.get("word_boxes") or c.get("text", "").strip())]
            logger.info(
                "build-grid: removed %d pipe-divider word_boxes from zone %d",
                removed_pipes, z.get("zone_index", 0),
            )

    # Strip pipe chars ONLY from cell edges (OCR artifacts).
    # Preserve pipes embedded in words as syllable separators.
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if "|" in text:
                cleaned = text.strip("|").strip()
                if cleaned != text.strip():
                    cell["text"] = cleaned


def _normalize_connector_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Normalize narrow connector columns where OCR appends noise chars.

    In synonym dictionaries a narrow column repeats the same word
    (e.g. "oder") in every row. OCR sometimes appends noise chars.
    """
    for z in zones_data:
        cols = z.get("columns", [])
        cells = z.get("cells", [])
        if not cols or not cells:
            continue
        for col in cols:
            ci = col.get("index")
            col_cells = [c for c in cells if c.get("col_index") == ci]
            if len(col_cells) < 3:
                continue
            text_counts: Dict[str, int] = {}
            for c in col_cells:
                t = (c.get("text") or "").strip()
                if t:
                    text_counts[t] = text_counts.get(t, 0) + 1
            if not text_counts:
                continue
            dominant_text = max(text_counts, key=text_counts.get)  # type: ignore[arg-type]
            dominant_count = text_counts[dominant_text]
            if len(dominant_text) > 10 or dominant_count < len(col_cells) * 0.6:
                continue
            fixed = 0
            for c in col_cells:
                t = (c.get("text") or "").strip()
                if t != dominant_text and t.startswith(dominant_text) and len(t) <= len(dominant_text) + 2:
                    c["text"] = dominant_text
                    wbs = c.get("word_boxes") or []
                    if len(wbs) == 1:
                        wbs[0]["text"] = dominant_text
                    fixed += 1
            if fixed:
                logger.info(
                    "build-grid: normalized %d outlier cells in connector column %d "
                    "(dominant='%s') zone %d",
                    fixed, ci, dominant_text, z.get("zone_index", 0),
                )


def _remove_border_strips(
    zones_data: List[Dict[str, Any]],
    border_prefiltered: bool,
) -> bool:
    """Detect and remove page-border decoration strips.

    Returns updated border_prefiltered flag.
    """
    border_strip_removed = 0
    if border_prefiltered:
        logger.info("Step 4e: skipped (border pre-filter already applied)")
        return border_prefiltered

    for z in zones_data:
        cells = z.get("cells", [])
        if not cells:
            continue
        all_wbs_with_cell: list = []
        for cell in cells:
            for wb in cell.get("word_boxes") or []:
                all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
        if len(all_wbs_with_cell) < 10:
            continue
        all_wbs_with_cell.sort(key=lambda t: t[0])
        total = len(all_wbs_with_cell)

        # -- Left-edge scan --
        left_strip_count = 0
        left_gap = 0
        running_right = 0
        for gi in range(total - 1):
            running_right = max(
                running_right,
                all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
            )
            gap = all_wbs_with_cell[gi + 1][0] - running_right
            if gap > 30:
                left_strip_count = gi + 1
                left_gap = gap
                break

        # -- Right-edge scan --
        right_strip_count = 0
        right_gap = 0
        running_left = all_wbs_with_cell[-1][0]
        for gi in range(total - 1, 0, -1):
            running_left = min(running_left, all_wbs_with_cell[gi][0])
            prev_right = (
                all_wbs_with_cell[gi - 1][0]
                + all_wbs_with_cell[gi - 1][1].get("width", 0)
            )
            gap = running_left - prev_right
            if gap > 30:
                right_strip_count = total - gi
                right_gap = gap
                break

        strip_wbs: set = set()
        strip_side = ""
        strip_gap = 0
        strip_count = 0
        if left_strip_count > 0 and left_strip_count / total < 0.20:
            strip_side = "left"
            strip_count = left_strip_count
            strip_gap = left_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
        elif right_strip_count > 0 and right_strip_count / total < 0.20:
            strip_side = "right"
            strip_count = right_strip_count
            strip_gap = right_gap
            strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}

        if not strip_wbs:
            continue
        for cell in cells:
            wbs = cell.get("word_boxes") or []
            filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
            if len(filtered) < len(wbs):
                border_strip_removed += len(wbs) - len(filtered)
                cell["word_boxes"] = filtered
                cell["text"] = _words_to_reading_order_text(filtered)
        z["cells"] = [c for c in cells
                      if (c.get("word_boxes") or c.get("text", "").strip())]
        logger.info(
            "Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
            "(gap=%dpx, strip=%d/%d wbs)",
            border_strip_removed, strip_side, z.get("zone_index", 0),
            strip_gap, strip_count, total,
        )

    return border_prefiltered


def _remove_alphabet_sidebars(zones_data: List[Dict[str, Any]]) -> None:
    """Remove decorative edge columns (alphabet sidebar safety net).

    Dictionary pages have A-Z letter sidebars that OCR reads as single-
    character word_boxes.
    """
    for z in zones_data:
        columns = z.get("columns", [])
        cells = z.get("cells", [])
        if len(columns) < 3 or not cells:
            continue
        col_cells: Dict[str, List[Dict]] = {}
        for cell in cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_"):
                col_cells.setdefault(ct, []).append(cell)
        col_types_ordered = sorted(col_cells.keys())
        if len(col_types_ordered) < 3:
            continue
        for edge_ct in [col_types_ordered[0], col_types_ordered[-1]]:
            edge_cells_list = col_cells.get(edge_ct, [])
            if len(edge_cells_list) < 3:
                continue
            texts = [(c.get("text") or "").strip() for c in edge_cells_list]
            avg_len = sum(len(t) for t in texts) / len(texts)
            single_char = sum(1 for t in texts if len(t) <= 1)
            single_ratio = single_char / len(texts)
            if avg_len > 1.5:
                continue
            if single_ratio < 0.7:
                continue
            removed_count = len(edge_cells_list)
            edge_ids = {id(c) for c in edge_cells_list}
            z["cells"] = [c for c in cells if id(c) not in edge_ids]
            z["columns"] = [col for col in columns if col.get("col_type") != edge_ct]
            logger.info(
                "Step 4f: removed decorative edge column '%s' from zone %d "
                "(%d cells, avg_len=%.1f, single_char=%.0f%%)",
                edge_ct, z.get("zone_index", 0), removed_count,
                avg_len, single_ratio * 100,
            )
            break  # only remove one edge per zone
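_remove_junk_rows on a toy zone (boxes invented): row 0 holds only a one-character, conf-12 word box, so Rule 1 drops the row and its cell; row 1 survives.

from grid_build_cleanup import _remove_junk_rows

zone = {
    "zone_index": 0,
    "rows": [{"index": 0}, {"index": 1}],
    "cells": [
        {"row_index": 0, "word_boxes": [{"text": "x", "conf": 12, "height": 20}]},
        {"row_index": 1, "word_boxes": [{"text": "Wörterbuch", "conf": 93, "height": 22}]},
    ],
}
_remove_junk_rows([zone])
print([r["index"] for r in zone["rows"]])  # -> [1]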
File diff suppressed because it is too large
Load Diff
452
klausur-service/backend/grid_build_finalize.py
Normal file
@@ -0,0 +1,452 @@
"""
Grid Build Finalize — Phase 5b+6: Orchestrates cell-level operations,
dictionary detection, syllable dividers, spell checking, empty column
removal, and result assembly.

Extracted from grid_build_core.py for maintainability.
"""

import logging
import re
from typing import Any, Dict, List, Optional

from grid_build_cell_ops import (
    _remove_bullets_and_artifacts,
    _remove_garbled_cells,
    _normalize_word_order,
    _enforce_max_columns,
)

logger = logging.getLogger(__name__)


def _finalize_grid(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_bgr: Any,
    img_w: int,
    img_h: int,
    session_id: str,
    max_columns: Optional[int],
    ipa_mode: str,
    syllable_mode: str,
    en_col_type: Optional[str],
    ipa_target_cols: set,
    all_content_cols: set,
    skip_ipa: bool,
    document_category: Optional[str],
    margin_strip_detected: bool,
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
) -> dict:
    """Run final processing steps and assemble result dict.

    Handles: bullet removal, artifact cells, word ordering, max_columns,
    dictionary detection, syllable dividers, spell check, empty columns,
    internal flag cleanup, and result assembly.
    """
    total_cols = sum(len(z.get("columns", [])) for z in zones_data)

    # 5i. Remove blue bullet/artifact word_boxes
    _remove_bullets_and_artifacts(zones_data)

    # 5j-pre. Remove cells whose text is entirely garbled / artifact noise
    _remove_garbled_cells(zones_data)

    # 5j. Normalise word_box order to reading order
    _normalize_word_order(zones_data)

    # 5k. Enforce max_columns by merging narrowest columns
    if max_columns and max_columns > 0:
        _enforce_max_columns(zones_data, max_columns)

    # --- Dictionary detection on assembled grid ---
    dict_detection = _detect_dictionary(
        zones_data, img_w, img_h, document_category, margin_strip_detected
    )

    # --- Word-gap merge ---
    try:
        from cv_syllable_detect import merge_word_gaps_in_zones
        merge_word_gaps_in_zones(zones_data, session_id)
    except Exception as e:
        logger.warning("Word-gap merge failed: %s", e)

    # --- Pipe auto-correction ---
    try:
        from cv_syllable_detect import autocorrect_pipe_artifacts
        autocorrect_pipe_artifacts(zones_data, session_id)
    except Exception as e:
        logger.warning("Pipe autocorrect failed: %s", e)

    # --- Syllable divider insertion ---
    syllable_insertions = _insert_syllable_dividers(
        zones_data, img_bgr, session_id, syllable_mode, dict_detection,
        en_col_type, all_content_cols, total_cols,
    )

    # --- Split merged words ---
    _split_merged_words(zones_data, session_id)

    # --- Ensure space before IPA/phonetic brackets ---
    _fix_ipa_spacing(zones_data)

    # --- SmartSpellChecker ---
    _run_spell_checker(zones_data, session_id, en_col_type, total_cols)

    # --- Debug log cell counts per column ---
    for z in zones_data:
        if z.get("zone_type") == "content":
            from collections import Counter as _Counter
            _cc = _Counter(c.get("col_index") for c in z.get("cells", []))
            _cols = z.get("columns", [])
            logger.info(
                "pre-empty-col-removal zone %d: %d cols, cells_per_col=%s",
                z.get("zone_index", 0), len(_cols), dict(sorted(_cc.items())),
            )

    # --- Remove empty columns ---
    _remove_empty_columns(zones_data)

    # Clean up internal flags before returning
    for z in zones_data:
        for cell in z.get("cells", []):
            cell.pop("_ipa_corrected", None)

    # 6. Build result
    return _assemble_result(
        zones_data, all_words, img_w, img_h, session_id,
        ipa_mode, syllable_mode, ipa_target_cols, skip_ipa,
        dict_detection, page_number_info, boxes_detected,
        recovered_count, duration, syllable_insertions,
    )


def _detect_dictionary(
    zones_data: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    document_category: Optional[str],
    margin_strip_detected: bool,
) -> Dict[str, Any]:
    """Run dictionary detection on the assembled grid."""
    from cv_layout import _score_dictionary_signals
    dict_detection: Dict[str, Any] = {"is_dictionary": False, "confidence": 0.0}
    try:
        from cv_vocab_types import ColumnGeometry
        for z in zones_data:
            zone_cells = z.get("cells", [])
            zone_cols = z.get("columns", [])
            if len(zone_cols) < 2 or len(zone_cells) < 10:
                continue
            pseudo_geoms = []
            for col in zone_cols:
                ci = col["index"]
                col_cells = [c for c in zone_cells if c.get("col_index") == ci]
                col_words = []
                for cell in col_cells:
                    for wb in cell.get("word_boxes") or []:
                        col_words.append({
                            "text": wb.get("text", ""),
                            "conf": wb.get("conf", 0),
                            "top": wb.get("top", 0),
                            "left": wb.get("left", 0),
                            "height": wb.get("height", 0),
                            "width": wb.get("width", 0),
                        })
                    if not cell.get("word_boxes") and cell.get("text"):
                        col_words.append({
                            "text": cell["text"],
                            "conf": cell.get("confidence", 50),
                            "top": cell.get("bbox_px", {}).get("y", 0),
                            "left": cell.get("bbox_px", {}).get("x", 0),
                            "height": cell.get("bbox_px", {}).get("h", 20),
                            "width": cell.get("bbox_px", {}).get("w", 50),
                        })
                col_w = col.get("x_max_px", 0) - col.get("x_min_px", 0)
                pseudo_geoms.append(ColumnGeometry(
                    index=ci, x=col.get("x_min_px", 0), y=0,
                    width=max(col_w, 1), height=img_h,
                    word_count=len(col_words), words=col_words,
                    width_ratio=col_w / max(img_w, 1),
                ))
            if len(pseudo_geoms) >= 2:
                dd = _score_dictionary_signals(
                    pseudo_geoms,
                    document_category=document_category,
                    margin_strip_detected=margin_strip_detected,
                )
                if dd["confidence"] > dict_detection["confidence"]:
                    dict_detection = dd
    except Exception as e:
        logger.warning("Dictionary detection failed: %s", e)
    return dict_detection


def _insert_syllable_dividers(
    zones_data: List[Dict[str, Any]],
    img_bgr: Any,
    session_id: str,
    syllable_mode: str,
    dict_detection: Dict[str, Any],
    en_col_type: Optional[str],
    all_content_cols: set,
    total_cols: int,
) -> int:
    """Insert syllable dividers for dictionary pages. Returns insertion count."""
    syllable_insertions = 0
    if syllable_mode == "none" or img_bgr is None:
        if syllable_mode == "none":
            for z in zones_data:
                for cell in z.get("cells", []):
                    t = cell.get("text", "")
                    if "|" in t:
                        cell["text"] = t.replace("|", "")
        return syllable_insertions

    _syllable_eligible = False
    if syllable_mode in ("all", "de", "en"):
        _syllable_eligible = True
    elif (dict_detection.get("is_dictionary")
          and dict_detection.get("article_col_index") is not None):
        _syllable_eligible = True

    _syllable_col_filter: Optional[set] = None
    if syllable_mode == "en":
        _syllable_col_filter = {en_col_type} if en_col_type else set()
    elif syllable_mode == "de":
        if en_col_type and total_cols >= 3:
            _syllable_col_filter = all_content_cols - {en_col_type}

    if _syllable_eligible:
        try:
            from cv_syllable_detect import insert_syllable_dividers
            force_syllables = (syllable_mode in ("all", "de", "en"))
            syllable_insertions = insert_syllable_dividers(
                zones_data, img_bgr, session_id,
                force=force_syllables,
                col_filter=_syllable_col_filter,
            )
        except Exception as e:
            logger.warning("Syllable insertion failed: %s", e)

    return syllable_insertions


def _split_merged_words(
    zones_data: List[Dict[str, Any]],
    session_id: str,
) -> None:
    """Split merged words using dictionary lookup."""
    try:
        from cv_review import _try_split_merged_word, _SPELL_AVAILABLE
        if not _SPELL_AVAILABLE:
            return
        split_count = 0
        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text:
                    continue
                parts = []
                changed = False
                for token in text.split():
                    clean = token
                    bracket_pos = clean.find('[')
                    suffix_ipa = ""
                    if bracket_pos > 0:
                        suffix_ipa = clean[bracket_pos:]
                        clean = clean[:bracket_pos]
                    suffix_punct = ""
                    stripped = clean.rstrip(".,!?;:'\")")
                    if stripped != clean:
                        suffix_punct = clean[len(stripped):]
                        clean = stripped
                    suffix = suffix_punct + suffix_ipa
                    contraction = ""
                    if "'" in clean and clean.index("'") >= 2:
                        apos_pos = clean.index("'")
                        contraction = clean[apos_pos:]
                        clean = clean[:apos_pos]
                        suffix = contraction + suffix
                    if len(clean) >= 4 and clean.isalpha():
                        split = _try_split_merged_word(clean)
                        if split:
                            parts.append(split + suffix)
                            changed = True
                            continue
                    parts.append(token)
                if changed:
                    cell["text"] = " ".join(parts)
                    split_count += 1
        if split_count:
            logger.info("build-grid session %s: split %d merged words", session_id, split_count)
    except ImportError:
        pass


def _fix_ipa_spacing(zones_data: List[Dict[str, Any]]) -> None:
    """Ensure space before IPA/phonetic brackets: 'word[ipa]' -> 'word [ipa]'."""
    _IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
    for z in zones_data:
        for cell in z.get("cells", []):
            text = cell.get("text", "")
            if text and "[" in text:
                fixed = _IPA_NOSPACE_RE.sub(r'\1 \2', text)
                if fixed != text:
                    cell["text"] = fixed


def _run_spell_checker(
    zones_data: List[Dict[str, Any]],
    session_id: str,
    en_col_type: Optional[str],
    total_cols: int,
) -> None:
    """Run SmartSpellChecker on all cells."""
    try:
        from smart_spell import SmartSpellChecker
        _ssc = SmartSpellChecker()
        spell_fix_count = 0

        for z in zones_data:
            for cell in z.get("cells", []):
                text = cell.get("text", "")
                if not text or not text.strip():
                    continue
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue

                if total_cols >= 3 and en_col_type:
                    lang = "en" if ct == en_col_type else "de"
                else:
                    lang = "auto"

                result = _ssc.correct_text(text, lang=lang)
                if result.changed:
                    cell["text"] = result.corrected
                    spell_fix_count += 1

        if spell_fix_count:
            logger.info(
                "build-grid session %s: SmartSpellChecker fixed %d cells",
                session_id, spell_fix_count,
            )
    except ImportError:
        logger.debug("SmartSpellChecker not available in build-grid")
    except Exception as e:
        logger.warning("SmartSpellChecker error in build-grid: %s", e)


def _remove_empty_columns(zones_data: List[Dict[str, Any]]) -> None:
    """Remove columns that have no cells assigned."""
    for z in zones_data:
        cells = z.get("cells", [])
        used_col_indices = {c.get("col_index") for c in cells}
        old_cols = z.get("columns", [])
        new_cols = [c for c in old_cols if c.get("col_index", c.get("index", -1)) in used_col_indices]
        if len(new_cols) < len(old_cols):
            old_to_new = {}
            for new_i, col in enumerate(new_cols):
                old_i = col.get("col_index", col.get("index", new_i))
                old_to_new[old_i] = new_i
                col["col_index"] = new_i
                col["index"] = new_i
                col["label"] = f"column_{new_i + 1}" if len(new_cols) > 1 else "column_text"
            for cell in cells:
                old_ci = cell.get("col_index", 0)
                cell["col_index"] = old_to_new.get(old_ci, old_ci)
                cell["col_type"] = f"column_{cell['col_index'] + 1}" if len(new_cols) > 1 else "column_text"
            z["columns"] = new_cols


def _assemble_result(
    zones_data: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
    session_id: str,
    ipa_mode: str,
    syllable_mode: str,
    ipa_target_cols: set,
    skip_ipa: bool,
    dict_detection: Dict[str, Any],
    page_number_info: Optional[Dict],
    boxes_detected: int,
    recovered_count: int,
    duration: float,
    syllable_insertions: int,
) -> dict:
    """Build the final result dict (Phase 6)."""
    total_cells = sum(len(z.get("cells", [])) for z in zones_data)
    total_columns = sum(len(z.get("columns", [])) for z in zones_data)
    total_rows = sum(len(z.get("rows", [])) for z in zones_data)

    # Collect color statistics
    color_stats: Dict[str, int] = {}
    for z in zones_data:
        for cell in z.get("cells", []):
            for wb in cell.get("word_boxes", []):
                cn = wb.get("color_name", "black")
                color_stats[cn] = color_stats.get(cn, 0) + 1

    # Compute layout metrics
    all_content_row_heights: List[float] = []
    for z in zones_data:
        for row in z.get("rows", []):
            if not row.get("is_header", False):
                h = row.get("y_max_px", 0) - row.get("y_min_px", 0)
                if h > 0:
                    all_content_row_heights.append(h)
    avg_row_height = (
        sum(all_content_row_heights) / len(all_content_row_heights)
        if all_content_row_heights else 30.0
    )
    font_size_suggestion = max(10, int(avg_row_height * 0.6))

    return {
        "session_id": session_id,
        "image_width": img_w,
        "image_height": img_h,
        "zones": zones_data,
        "boxes_detected": boxes_detected,
        "summary": {
            "total_zones": len(zones_data),
            "total_columns": total_columns,
            "total_rows": total_rows,
            "total_cells": total_cells,
            "total_words": len(all_words),
            "recovered_colored": recovered_count,
            "color_stats": color_stats,
        },
        "formatting": {
            "bold_columns": [],
            "header_rows": [],
        },
        "layout_metrics": {
            "page_width_px": img_w,
            "page_height_px": img_h,
            "avg_row_height_px": round(avg_row_height, 1),
            "font_size_suggestion_px": font_size_suggestion,
        },
        "dictionary_detection": {
            "is_dictionary": dict_detection.get("is_dictionary", False),
            "confidence": dict_detection.get("confidence", 0.0),
            "signals": dict_detection.get("signals", {}),
            "article_col_index": dict_detection.get("article_col_index"),
            "headword_col_index": dict_detection.get("headword_col_index"),
        },
        "processing_modes": {
            "ipa_mode": ipa_mode,
            "syllable_mode": syllable_mode,
            "ipa_applied": bool(ipa_target_cols) if not skip_ipa else False,
            "syllables_applied": syllable_insertions > 0,
        },
        "page_number": page_number_info,
        "duration_seconds": round(duration, 2),
    }
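How _remove_empty_columns renumbers after a drop, on invented data: column 1 has no cells, so columns 0 and 2 become 0 and 1, and the cells' col_index/col_type follow.

from grid_build_finalize import _remove_empty_columns

zone = {
    "columns": [{"index": 0}, {"index": 1}, {"index": 2}],
    "cells": [
        {"col_index": 0, "text": "links"},
        {"col_index": 2, "text": "rechts"},
    ],
}
_remove_empty_columns([zone])
print([c["label"] for c in zone["columns"]])    # -> ['column_1', 'column_2']
print([c["col_index"] for c in zone["cells"]])  # -> [0, 1]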
489
klausur-service/backend/grid_build_text_ops.py
Normal file
@@ -0,0 +1,489 @@
|
||||
"""
|
||||
Grid Build Text Ops — Phase 4+5a: Color annotation, heading detection,
|
||||
parenthesis fix, IPA phonetic correction, page ref extraction, and
|
||||
slash-IPA conversion.
|
||||
|
||||
Extracted from grid_build_core.py for maintainability.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
from cv_color_detect import detect_word_colors
|
||||
from cv_ocr_engines import (
|
||||
fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa,
|
||||
_lookup_ipa,
|
||||
)
|
||||
from grid_editor_helpers import (
|
||||
_detect_heading_rows_by_color,
|
||||
_detect_heading_rows_by_single_cell,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _process_text(
|
||||
zones_data: List[Dict[str, Any]],
|
||||
img_bgr: Any,
|
||||
img_w: int,
|
||||
img_h: int,
|
||||
ipa_mode: str,
|
||||
page_number_info: Optional[Dict],
|
||||
) -> Dict[str, Any]:
|
||||
"""Run color annotation, heading detection, IPA correction, and page refs.
|
||||
|
||||
Args:
|
||||
zones_data: List of zone dicts (modified in place).
|
||||
img_bgr: BGR image array (or None).
|
||||
img_w: Image width.
|
||||
img_h: Image height.
|
||||
ipa_mode: IPA processing mode.
|
||||
page_number_info: Existing page number metadata (may be None).
|
||||
|
||||
Returns:
|
||||
Dict with keys: en_col_type, ipa_target_cols, all_content_cols,
|
||||
skip_ipa, page_number_info.
|
||||
"""
|
||||
# 5. Color annotation on final word_boxes in cells
|
||||
if img_bgr is not None:
|
||||
all_wb: List[Dict] = []
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
all_wb.extend(cell.get("word_boxes", []))
|
||||
detect_word_colors(img_bgr, all_wb)
|
||||
|
||||
# 5a. Heading detection by color + height
|
||||
heading_count = _detect_heading_rows_by_color(zones_data, img_w, img_h)
|
||||
if heading_count:
|
||||
logger.info("Detected %d heading rows by color+height", heading_count)
|
||||
|
||||
# 5b. Fix unmatched parentheses in cell text
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
text = cell.get("text", "")
|
||||
if ")" in text and "(" not in text:
|
||||
cell["text"] = "(" + text
|
||||
|
||||
# 5c. IPA phonetic correction
|
||||
all_cells = [cell for z in zones_data for cell in z.get("cells", [])]
|
||||
total_cols = sum(len(z.get("columns", [])) for z in zones_data)
|
||||
en_col_type = None
|
||||
ipa_target_cols: set = set()
|
||||
all_content_cols: set = set()
|
||||
skip_ipa = (ipa_mode == "none")
|
||||
|
||||
# When ipa_mode=none, strip ALL square brackets from ALL content columns
|
||||
if skip_ipa:
|
||||
_SQUARE_BRACKET_RE_NONE = re.compile(r'\s*\[[^\]]+\]')
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if not ct.startswith("column_"):
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
if "[" in text:
|
||||
stripped = _SQUARE_BRACKET_RE_NONE.sub("", text)
|
||||
if stripped != text:
|
||||
cell["text"] = stripped.strip()
|
||||
cell["_ipa_corrected"] = True
|
||||
|
||||
if not skip_ipa and total_cols >= 3:
|
||||
en_col_type, ipa_target_cols, all_content_cols = _run_ipa_correction(
|
||||
all_cells, total_cols, ipa_mode, zones_data
|
||||
)
|
||||
elif not skip_ipa:
|
||||
# Collect all_content_cols even when <3 cols (needed by finalize)
|
||||
for cell in all_cells:
|
||||
ct = cell.get("col_type", "")
|
||||
if ct.startswith("column_") and (cell.get("text") or "").strip():
|
||||
all_content_cols.add(ct)
|
||||
|
||||
# 5e. Heading detection by single-cell rows
|
||||
single_heading_count = _detect_heading_rows_by_single_cell(zones_data, img_w, img_h)
|
||||
if single_heading_count:
|
||||
logger.info("Detected %d heading rows by single-cell heuristic", single_heading_count)
|
||||
|
||||
# 5f. Strip IPA from headings
|
||||
for z in zones_data:
|
||||
for cell in z.get("cells", []):
|
||||
if cell.get("col_type") != "heading":
|
||||
continue
|
||||
text = cell.get("text", "")
|
||||
stripped = re.sub(r'\s*\[[^\]]*\]\s*$', '', text).strip()
|
||||
if stripped and stripped != text:
|
||||
cell["text"] = stripped
|
||||
|
||||
# 5g. Extract page_ref cells and footer rows
|
||||
_extract_page_refs_and_footers(zones_data, page_number_info)
|
||||
|
||||
# 5h. Convert slash-delimited IPA to bracket notation
|
||||
_convert_slash_ipa(zones_data, skip_ipa, en_col_type)
|
||||
|
||||
return {
|
||||
"en_col_type": en_col_type,
|
||||
"ipa_target_cols": ipa_target_cols,
|
||||
"all_content_cols": all_content_cols,
|
||||
"skip_ipa": skip_ipa,
|
||||
"page_number_info": page_number_info,
|
||||
}
|
||||
|
||||
|
def _run_ipa_correction(
    all_cells: List[Dict],
    total_cols: int,
    ipa_mode: str,
    zones_data: List[Dict[str, Any]],
) -> Tuple[Optional[str], set, set]:
    """Run IPA correction on cells. Returns (en_col_type, ipa_target_cols, all_content_cols)."""
    en_col_type = None
    all_content_cols: set = set()

    # Detect English headword column via IPA signals
    col_ipa_count: Dict[str, int] = {}
    for cell in all_cells:
        ct = cell.get("col_type", "")
        if not ct.startswith("column_"):
            continue
        txt = cell.get("text", "") or ""
        if txt.strip():
            all_content_cols.add(ct)
        if '[' in txt or _text_has_garbled_ipa(txt):
            col_ipa_count[ct] = col_ipa_count.get(ct, 0) + 1
    if col_ipa_count:
        en_col_type = max(col_ipa_count, key=col_ipa_count.get)
    elif ipa_mode == "all":
        col_cell_count: Dict[str, int] = {}
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct.startswith("column_") and (cell.get("text") or "").strip():
                col_cell_count[ct] = col_cell_count.get(ct, 0) + 1
        if col_cell_count:
            en_col_type = max(col_cell_count, key=col_cell_count.get)

    # Decide which columns to process based on ipa_mode
    en_ipa_target_cols: set = set()
    de_ipa_target_cols: set = set()
    if ipa_mode in ("auto", "en"):
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
    elif ipa_mode == "de":
        de_ipa_target_cols = all_content_cols - {en_col_type} if en_col_type else all_content_cols
    elif ipa_mode == "all":
        if en_col_type:
            en_ipa_target_cols.add(en_col_type)
        de_ipa_target_cols = all_content_cols - en_ipa_target_cols

    # --- Strip IPA from columns NOT in the target set ---
    _SQUARE_BRACKET_RE = re.compile(r'\s*\[[^\]]+\]')
    strip_en_ipa = en_col_type and en_col_type not in en_ipa_target_cols
    if strip_en_ipa or ipa_mode == "none":
        strip_cols = {en_col_type} if strip_en_ipa and ipa_mode != "none" else all_content_cols
        for cell in all_cells:
            ct = cell.get("col_type", "")
            if ct not in strip_cols:
                continue
            text = cell.get("text", "")
            if "[" in text:
                stripped = _SQUARE_BRACKET_RE.sub("", text)
                if stripped != text:
                    cell["text"] = stripped.strip()
                    cell["_ipa_corrected"] = True

    # Snapshot cell texts before IPA insertion so changed cells can be marked
    # afterwards. (Previously this snapshot lived inside the English branch,
    # which raised a NameError when only German columns were processed.)
    _pre_ipa = {id(c): c.get("text", "") for c in all_cells}

    # --- English IPA (Britfone + eng_to_ipa) ---
    if en_ipa_target_cols:
        for cell in all_cells:
            ct = cell.get("col_type")
            if ct in en_ipa_target_cols:
                cell["_orig_col_type"] = ct
                cell["col_type"] = "column_en"
        fix_cell_phonetics(all_cells, pronunciation="british")
        for cell in all_cells:
            orig = cell.pop("_orig_col_type", None)
            if orig:
                cell["col_type"] = orig
            if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
                cell["_ipa_corrected"] = True

    # --- German IPA (wiki-pronunciation-dict + epitran) ---
    if de_ipa_target_cols:
        from cv_ipa_german import insert_german_ipa
        insert_german_ipa(all_cells, de_ipa_target_cols)

    ipa_target_cols = en_ipa_target_cols | de_ipa_target_cols

    # Mark cells whose text was changed by IPA correction
    for cell in all_cells:
        if cell.get("text", "") != _pre_ipa.get(id(cell), ""):
            cell["_ipa_corrected"] = True

    # 5d. Fix IPA continuation cells
    skip_ipa = (ipa_mode == "none")
    _REAL_IPA_CHARS = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    ipa_cont_fixed = 0
    for z in ([] if skip_ipa else zones_data):
        rows_sorted = sorted(z.get("rows", []), key=lambda r: r["index"])
        z_cells = z.get("cells", [])
        for idx, row in enumerate(rows_sorted):
            if idx == 0:
                continue
            ri = row["index"]
            row_cells = [c for c in z_cells if c.get("row_index") == ri]
            for cell in row_cells:
                ct = cell.get("col_type", "")
                if not ct.startswith("column_"):
                    continue
                cell_text = (cell.get("text") or "").strip()
                if not cell_text:
                    wb_texts = [w.get("text", "")
                                for w in cell.get("word_boxes", [])]
                    cell_text = " ".join(wb_texts).strip()
                if not cell_text:
                    continue

                is_bracketed = (
                    cell_text.startswith('[') and cell_text.endswith(']')
                )

                if is_bracketed:
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if re.search(r'\[[^\]]*[ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ][^\]]*\]', cell_text):
                        continue
                else:
                    content_cells_in_row = [
                        c for c in row_cells
                        if c.get("col_type", "").startswith("column_")
                        and c.get("col_type") != "column_1"
                    ]
                    if len(content_cells_in_row) != 1:
                        continue
                    if not _text_has_garbled_ipa(cell_text):
                        continue
                    if any(c in _REAL_IPA_CHARS for c in cell_text):
                        continue
                    _words_in_text = re.findall(r'[A-Za-zÄÖÜäöüß]{3,}', cell_text)
                    if len(_words_in_text) >= 3:
                        continue

                # Find headword in previous row, same column
                prev_ri = rows_sorted[idx - 1]["index"]
                prev_same_col = [
                    c for c in z_cells
                    if c.get("row_index") == prev_ri
                    and c.get("col_type") == ct
                ]
                if not prev_same_col:
                    continue
                prev_text = prev_same_col[0].get("text", "")
                fixed = fix_ipa_continuation_cell(
                    cell_text, prev_text, pronunciation="british",
                )
                if fixed != cell_text:
                    cell["text"] = fixed
                    ipa_cont_fixed += 1
                    logger.info(
                        "IPA continuation R%d %s: '%s' -> '%s'",
                        ri, ct, cell_text, fixed,
                    )
    if ipa_cont_fixed:
        logger.info("Fixed %d IPA continuation cells", ipa_cont_fixed)

    return en_col_type, ipa_target_cols, all_content_cols


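# --- Editor's note: illustrative sketch, not part of the original commit. ---
# _run_ipa_correction picks the English column as the one whose cells most
# often already contain "[" or garbled IPA. A standalone reduction of that
# vote (cell dict shapes assumed from the surrounding code):
def _example_pick_en_column() -> str:
    cells = [
        {"col_type": "column_1", "text": "p. 12"},
        {"col_type": "column_2", "text": "tiger [ˈtaɪɡə]"},
        {"col_type": "column_2", "text": "lion [ˈlaɪən]"},
        {"col_type": "column_3", "text": "der Tiger"},
    ]
    votes: dict = {}
    for c in cells:
        if c["col_type"].startswith("column_") and "[" in c["text"]:
            votes[c["col_type"]] = votes.get(c["col_type"], 0) + 1
    return max(votes, key=votes.get)  # -> "column_2"

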
def _extract_page_refs_and_footers(
    zones_data: List[Dict[str, Any]],
    page_number_info: Optional[Dict],
) -> Optional[Dict]:
    """Extract page_ref cells and footer rows from content zones.

    Modifies zones_data in place. Returns the (possibly newly created)
    page_number_info so the caller can pick up a page number found in a
    footer; rebinding the parameter here would not reach the caller.
    """
    _REAL_IPA_CHARS_SET = set("ˈˌəɪɛɒʊʌæɑɔʃʒθðŋ")
    _PAGE_REF_RE = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    _NUMBER_WORDS = {
        "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen",
        "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
        "nineteen", "twenty", "thirty", "forty", "fifty", "sixty",
        "seventy", "eighty", "ninety", "hundred", "thousand", "and",
        "einhundert", "zweihundert", "dreihundert", "vierhundert",
        "und", "zehn", "zwanzig", "dreißig", "vierzig", "fünfzig",
    }

    for z in zones_data:
        if z.get("zone_type") != "content":
            continue
        cells = z.get("cells", [])
        rows = z.get("rows", [])
        if not rows:
            continue

        # Extract column_1 cells that look like page references
        page_refs = []
        page_ref_cell_ids = set()
        for cell in cells:
            if cell.get("col_type") != "column_1":
                continue
            text = (cell.get("text") or "").strip()
            if not text:
                continue
            if not _PAGE_REF_RE.match(text):
                continue
            page_refs.append({
                "row_index": cell.get("row_index"),
                "text": text,
                "bbox_pct": cell.get("bbox_pct", {}),
            })
            page_ref_cell_ids.add(cell.get("cell_id"))

        # Detect footer: last non-header row if it has only 1 cell
        footer_rows = []
        non_header_rows = [r for r in rows if not r.get("is_header")]
        if non_header_rows:
            last_row = non_header_rows[-1]
            last_ri = last_row["index"]
            last_cells = [c for c in z["cells"]
                          if c.get("row_index") == last_ri]
            if len(last_cells) == 1:
                text = (last_cells[0].get("text") or "").strip()
                has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
                has_commas = ',' in text
                text_words = set(text.lower().split())
                is_written_number = len(text_words) >= 2 and text_words.issubset(_NUMBER_WORDS)
                is_page_number = len(text) <= 20 or is_written_number
                if (text and not has_real_ipa and not has_commas
                        and is_page_number
                        and last_cells[0].get("col_type") != "heading"):
                    footer_rows.append({
                        "row_index": last_ri,
                        "text": text,
                        "bbox_pct": last_cells[0].get("bbox_pct", {}),
                    })

        # Classify footer rows
        page_number_footers = []
        other_footers = []
        for fr in footer_rows:
            ft = fr["text"].strip()
            digits = "".join(c for c in ft if c.isdigit())
            if digits and re.match(r'^[\d\s.]+$', ft):
                page_number_footers.append(fr)
            elif ft.lower().split() and set(ft.lower().split()).issubset(_NUMBER_WORDS):
                page_number_footers.append(fr)
            else:
                other_footers.append(fr)

        # Remove page-number footer rows from grid entirely
        if page_number_footers:
            pn_ris = {fr["row_index"] for fr in page_number_footers}
            z["cells"] = [c for c in z["cells"] if c.get("row_index") not in pn_ris]
            z["rows"] = [r for r in z["rows"] if r["index"] not in pn_ris]
            pn_text = page_number_footers[0]["text"].strip()
            pn_digits = "".join(c for c in pn_text if c.isdigit())
            if not page_number_info:
                page_number_info = {
                    "text": pn_text,
                    "y_pct": page_number_footers[0].get("bbox_pct", {}).get("y", 95),
                }
            if pn_digits:
                page_number_info["number"] = int(pn_digits)

        # Mark remaining footer rows
        if other_footers:
            footer_ris = {fr["row_index"] for fr in other_footers}
            for r in z["rows"]:
                if r["index"] in footer_ris:
                    r["is_footer"] = True
            for c in z["cells"]:
                if c.get("row_index") in footer_ris:
                    c["col_type"] = "footer"

        if page_refs or footer_rows:
            logger.info(
                "Extracted %d page_refs + %d footer rows (%d page numbers removed) from zone %d",
                len(page_refs), len(footer_rows), len(page_number_footers),
                z.get("zone_index", 0),
            )

        if page_refs:
            z["page_refs"] = page_refs
        if other_footers:
            z["footer"] = other_footers

    return page_number_info


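# --- Editor's note: illustrative sketch, not part of the original commit. ---
# _PAGE_REF_RE above accepts short page references such as "p. 12", "P 7"
# or ", 34" (a leading comma being a plausible OCR misread of "p"). Quick
# standalone check of the pattern:
def _example_page_ref_matches() -> list:
    import re
    pat = re.compile(r'^[pP,]?\s*\.?\s*\d+$')
    return [bool(pat.match(t)) for t in ("p. 12", "P 7", ", 34", "page 12")]
    # -> [True, True, True, False]

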
def _convert_slash_ipa(
    zones_data: List[Dict[str, Any]],
    skip_ipa: bool,
    en_col_type: Optional[str],
) -> None:
    """Convert slash-delimited IPA to bracket notation.

    Dictionary-style pages print IPA between slashes: "tiger /'taiga/"
    """
    _SLASH_IPA_RE = re.compile(
        r'(\b[a-zA-Z]+[²³¹]?)\s*'  # headword (capture group 1)
        r"(/[^/]{2,}/)"            # /ipa/ (capture group 2), min 2 chars
    )
    _STANDALONE_SLASH_IPA_RE = re.compile(r'^/([^/]{2,})/')
    _SLASH_IPA_REJECT_RE = re.compile(r'[\s(),]')
    _AFTER_BRACKET_SLASH = re.compile(r'(?<=\])\s*(/[^/]{2,}/)')
    slash_ipa_fixed = 0

    for z in ([] if skip_ipa else zones_data):
        for cell in z.get("cells", []):
            if en_col_type and cell.get("col_type") != en_col_type:
                continue
            text = cell.get("text", "")
            if "/" not in text:
                continue

            def _replace_slash_ipa(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                headword = m.group(1)
                ocr_ipa = m.group(2)
                inner_raw = ocr_ipa.strip("/").strip()
                if _SLASH_IPA_REJECT_RE.search(inner_raw):
                    return m.group(0)
                clean_hw = re.sub(r'[²³¹\d]', '', headword).strip()
                ipa = _lookup_ipa(clean_hw, "british") if clean_hw else None
                if ipa:
                    slash_ipa_fixed += 1
                    return f"{headword} [{ipa}]"
                inner = inner_raw.lstrip("'").strip()
                if inner:
                    slash_ipa_fixed += 1
                    return f"{headword} [{inner}]"
                return m.group(0)

            new_text = _SLASH_IPA_RE.sub(_replace_slash_ipa, text)

            def _replace_trailing_slash(m: re.Match) -> str:
                nonlocal slash_ipa_fixed
                inner = m.group(1).strip("/").strip().lstrip("'").strip()
                if _SLASH_IPA_REJECT_RE.search(inner):
                    return m.group(0)
                if inner:
                    slash_ipa_fixed += 1
                    return f" [{inner}]"
                return m.group(0)

            new_text = _AFTER_BRACKET_SLASH.sub(_replace_trailing_slash, new_text)

            if new_text == text:
                m = _STANDALONE_SLASH_IPA_RE.match(text)
                if m:
                    inner = m.group(1).strip()
                    if not _SLASH_IPA_REJECT_RE.search(inner):
                        inner = inner.lstrip("'").strip()
                        if inner:
                            new_text = "[" + inner + "]" + text[m.end():]
                            slash_ipa_fixed += 1

            if new_text != text:
                cell["text"] = new_text

    if slash_ipa_fixed:
        logger.info("Step 5h: converted %d slash-IPA to bracket notation", slash_ipa_fixed)
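
# --- Editor's note: illustrative sketch, not part of the original commit. ---
# End-to-end effect of _convert_slash_ipa when no dictionary entry exists:
# the OCR'd slash transcription is kept but rebracketed. The _lookup_ipa
# miss path is simulated here with a plain regex substitution:
def _example_slash_to_bracket() -> str:
    import re
    text = "tiger /'taiga/"
    return re.sub(r"(\b[a-zA-Z]+)\s*/([^/]{2,})/",
                  lambda m: m.group(1) + " [" + m.group(2).lstrip("'") + "]",
                  text)  # -> "tiger [taiga]"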
462
klausur-service/backend/grid_build_zones.py
Normal file
@@ -0,0 +1,462 @@
"""
Grid Build Zones — Phase 2: Image processing, graphic detection, box/zone
detection and zone-aware grid building.

Extracted from grid_build_core.py for maintainability.
"""

import logging
from typing import Any, Dict, List, Optional

import cv2
import numpy as np

from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_color_detect import recover_colored_text
from cv_vocab_types import PageZone
from ocr_pipeline_session_store import get_session_image

from grid_editor_helpers import (
    _filter_border_strip_words,
    _filter_border_ghosts,
    _words_in_zone,
    _PIPE_RE_VSPLIT,
    _detect_vertical_dividers,
    _split_zone_at_vertical_dividers,
    _merge_content_zones_across_boxes,
    _build_zone_grid,
)

logger = logging.getLogger(__name__)


async def _build_zones(
    session_id: str,
    session: dict,
    all_words: List[Dict[str, Any]],
    graphic_rects: List[Dict[str, int]],
    content_x: int,
    content_y: int,
    content_w: int,
    content_h: int,
    img_w: int,
    img_h: int,
) -> Dict[str, Any]:
    """Load image, detect graphics/boxes, build zone-aware grids.

    Returns a dict with keys:
        zones_data, boxes_detected, recovered_count, border_prefiltered,
        img_bgr, all_words (modified in-place but returned for clarity).
    """
    zones_data: List[Dict[str, Any]] = []
    boxes_detected = 0
    recovered_count = 0
    border_prefiltered = False
    img_bgr = None

    # 3. Load image for box detection
    img_png = await get_session_image(session_id, "cropped")
    if not img_png:
        img_png = await get_session_image(session_id, "dewarped")
    if not img_png:
        img_png = await get_session_image(session_id, "original")

    if img_png:
        # Decode image for color detection + box detection
        arr = np.frombuffer(img_png, dtype=np.uint8)
        img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)

    if img_bgr is not None:
        # --- 3a. Detect graphic/image regions via CV and hard-filter ---
        sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
        fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
        if fresh_graphics:
            fresh_rects = [
                {"x": g.x, "y": g.y, "w": g.width, "h": g.height}
                for g in fresh_graphics
            ]
            graphic_rects.extend(fresh_rects)
            logger.info(
                "build-grid session %s: detected %d graphic region(s) via CV",
                session_id, len(fresh_graphics),
            )
            # Hard-filter words inside newly detected graphic regions
            before = len(all_words)
            all_words[:] = [
                w for w in all_words
                if not any(
                    gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
                    and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
                    for gr in fresh_rects
                )
            ]
            removed = before - len(all_words)
            if removed:
                logger.info(
                    "build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
                    session_id, removed, len(fresh_rects),
                )

        # --- Recover colored text that OCR missed (before grid building) ---
        recovered = recover_colored_text(img_bgr, all_words)
        if recovered and graphic_rects:
            # Filter recovered chars inside graphic regions
            recovered = [
                r for r in recovered
                if not any(
                    gr["x"] <= r["left"] + r.get("width", 0) / 2 <= gr["x"] + gr["w"]
                    and gr["y"] <= r["top"] + r.get("height", 0) / 2 <= gr["y"] + gr["h"]
                    for gr in graphic_rects
                )
            ]
        if recovered:
            recovered_count = len(recovered)
            all_words.extend(recovered)
            logger.info(
                "build-grid session %s: +%d recovered colored words",
                session_id, recovered_count,
            )

        # Detect bordered boxes
        boxes = detect_boxes(
            img_bgr,
            content_x=content_x,
            content_w=content_w,
            content_y=content_y,
            content_h=content_h,
        )
        boxes_detected = len(boxes)

        if boxes:
            # Filter border ghost words before grid building
            all_words_new, ghost_count = _filter_border_ghosts(all_words, boxes)
            if ghost_count:
                all_words[:] = all_words_new
                logger.info(
                    "build-grid session %s: removed %d border ghost words",
                    session_id, ghost_count,
                )

            # Split page into zones
            page_zones = split_page_into_zones(
                content_x, content_y, content_w, content_h, boxes
            )

            # Merge content zones separated by box zones
            page_zones = _merge_content_zones_across_boxes(
                page_zones, content_x, content_w
            )

            # 3b. Detect vertical dividers and split content zones
            page_zones, border_prefiltered_vd = _detect_and_split_vertical_dividers(
                page_zones, all_words
            )

            # --- First pass: build grids per zone independently ---
            zone_grids = _build_grids_per_zone(
                page_zones, all_words, img_w, img_h
            )
            border_prefiltered = border_prefiltered or any(
                zg.get("_border_prefiltered") for zg in zone_grids
            )

            # --- Second pass: merge column boundaries from all content zones ---
            _merge_content_zone_columns(
                zone_grids, all_words, content_w, img_w, img_h, session_id
            )

            # --- Build zones_data from zone_grids ---
            for zg in zone_grids:
                pz = zg["pz"]
                grid = zg["grid"]
                grid.pop("_raw_columns", None)

                zone_entry: Dict[str, Any] = {
                    "zone_index": pz.index,
                    "zone_type": pz.zone_type,
                    "bbox_px": {
                        "x": pz.x, "y": pz.y,
                        "w": pz.width, "h": pz.height,
                    },
                    "bbox_pct": {
                        "x": round(pz.x / img_w * 100, 2) if img_w else 0,
                        "y": round(pz.y / img_h * 100, 2) if img_h else 0,
                        "w": round(pz.width / img_w * 100, 2) if img_w else 0,
                        "h": round(pz.height / img_h * 100, 2) if img_h else 0,
                    },
                    "border": None,
                    "word_count": len(zg["words"]),
                    **grid,
                }

                if pz.box:
                    zone_entry["border"] = {
                        "thickness": pz.box.border_thickness,
                        "confidence": pz.box.confidence,
                    }

                if pz.image_overlays:
                    zone_entry["image_overlays"] = pz.image_overlays

                if pz.layout_hint:
                    zone_entry["layout_hint"] = pz.layout_hint
                if pz.vsplit_group is not None:
                    zone_entry["vsplit_group"] = pz.vsplit_group

                zones_data.append(zone_entry)

    # 4. Fallback: no boxes detected -> single zone with all words
    if not zones_data:
        before = len(all_words)
        filtered_words = [
            w for w in all_words
            if not (w.get("recovered") and len(w.get("text", "").strip()) <= 2)
        ]
        removed = before - len(filtered_words)
        if removed:
            logger.info(
                "build-grid session %s: filtered %d recovered artifacts (fallback zone)",
                session_id, removed,
            )
        filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
        if bs_removed:
            border_prefiltered = True
            logger.info(
                "build-grid session %s: pre-filtered %d border-strip words",
                session_id, bs_removed,
            )
        grid = _build_zone_grid(
            filtered_words, content_x, content_y, content_w, content_h,
            0, img_w, img_h,
        )
        grid.pop("_raw_columns", None)
        zones_data.append({
            "zone_index": 0,
            "zone_type": "content",
            "bbox_px": {
                "x": content_x, "y": content_y,
                "w": content_w, "h": content_h,
            },
            "bbox_pct": {
                "x": round(content_x / img_w * 100, 2) if img_w else 0,
                "y": round(content_y / img_h * 100, 2) if img_h else 0,
                "w": round(content_w / img_w * 100, 2) if img_w else 0,
                "h": round(content_h / img_h * 100, 2) if img_h else 0,
            },
            "border": None,
            "word_count": len(all_words),
            **grid,
        })

    return {
        "zones_data": zones_data,
        "boxes_detected": boxes_detected,
        "recovered_count": recovered_count,
        "border_prefiltered": border_prefiltered,
        "img_bgr": img_bgr,
    }


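# --- Editor's note: illustrative sketch, not part of the original commit. ---
# _build_zones drops a word when its midpoint falls inside any graphic
# rectangle. The predicate, reduced to a standalone helper (word/rect dict
# shapes assumed from the surrounding code, values made up):
def _example_midpoint_in_rect() -> bool:
    word = {"left": 100, "top": 50, "width": 40, "height": 10}
    rect = {"x": 90, "y": 40, "w": 80, "h": 40}
    mx = word["left"] + word["width"] / 2
    my = word["top"] + word["height"] / 2
    return (rect["x"] <= mx <= rect["x"] + rect["w"]
            and rect["y"] <= my <= rect["y"] + rect["h"])  # -> True

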
def _detect_and_split_vertical_dividers(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
) -> tuple:
    """Detect vertical dividers and split content zones.

    Returns (expanded_zones, border_prefiltered_from_vsplit).
    """
    vsplit_group_counter = 0
    expanded_zones: List = []
    for pz in page_zones:
        if pz.zone_type != "content":
            expanded_zones.append(pz)
            continue
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        divider_xs = _detect_vertical_dividers(
            zone_words, pz.x, pz.width, pz.y, pz.height
        )
        if divider_xs:
            sub_zones = _split_zone_at_vertical_dividers(
                pz, divider_xs, vsplit_group_counter
            )
            expanded_zones.extend(sub_zones)
            vsplit_group_counter += 1
            # Remove pipe words so they don't appear in sub-zones
            pipe_ids = set(
                id(w) for w in zone_words
                if _PIPE_RE_VSPLIT.match((w.get("text") or "").strip())
            )
            all_words[:] = [w for w in all_words if id(w) not in pipe_ids]
            logger.info(
                "build-grid: vertical split zone %d at x=%s -> %d sub-zones",
                pz.index, [int(x) for x in divider_xs], len(sub_zones),
            )
        else:
            expanded_zones.append(pz)
    # Re-index zones
    for i, pz in enumerate(expanded_zones):
        pz.index = i
    # Vertical splitting never pre-filters border words, so the second tuple
    # element is currently always False.
    return expanded_zones, False


def _build_grids_per_zone(
    page_zones: List[PageZone],
    all_words: List[Dict[str, Any]],
    img_w: int,
    img_h: int,
) -> List[Dict[str, Any]]:
    """Build grids for each zone independently (first pass)."""
    zone_grids: List[Dict] = []

    for pz in page_zones:
        zone_words = _words_in_zone(
            all_words, pz.y, pz.height, pz.x, pz.width
        )
        if pz.zone_type == "content":
            logger.info(
                "build-grid zone %d (%s): bounds x=%d..%d y=%d..%d -> %d/%d words",
                pz.index, pz.zone_type,
                pz.x, pz.x + pz.width, pz.y, pz.y + pz.height,
                len(zone_words), len(all_words),
            )
        # Filter recovered single-char artifacts in ALL zones
        before = len(zone_words)
        zone_words = [
            w for w in zone_words
            if not (
                w.get("recovered")
                and len(w.get("text", "").strip()) <= 2
            )
        ]
        removed = before - len(zone_words)
        if removed:
            logger.info(
                "build-grid: filtered %d recovered artifacts from %s zone %d",
                removed, pz.zone_type, pz.index,
            )
        # Filter words inside image overlay regions (merged box zones)
        if pz.image_overlays:
            before_ov = len(zone_words)
            zone_words = [
                w for w in zone_words
                if not any(
                    ov["y"] <= w["top"] + w["height"] / 2 <= ov["y"] + ov["height"]
                    and ov["x"] <= w["left"] + w["width"] / 2 <= ov["x"] + ov["width"]
                    for ov in pz.image_overlays
                )
            ]
            ov_removed = before_ov - len(zone_words)
            if ov_removed:
                logger.info(
                    "build-grid: filtered %d words inside image overlays from zone %d",
                    ov_removed, pz.index,
                )
        zone_words, bs_removed = _filter_border_strip_words(zone_words)
        bp = False
        if bs_removed:
            bp = True
            logger.info(
                "build-grid: pre-filtered %d border-strip words from zone %d",
                bs_removed, pz.index,
            )
        grid = _build_zone_grid(
            zone_words, pz.x, pz.y, pz.width, pz.height,
            pz.index, img_w, img_h,
            skip_first_row_header=bool(pz.image_overlays),
        )
        zone_grids.append({
            "pz": pz, "words": zone_words, "grid": grid,
            "_border_prefiltered": bp,
        })

    return zone_grids


def _merge_content_zone_columns(
    zone_grids: List[Dict[str, Any]],
    all_words: List[Dict[str, Any]],
    content_w: int,
    img_w: int,
    img_h: int,
    session_id: str,
) -> None:
    """Second pass: merge column boundaries from all content zones.

    Modifies zone_grids in place.
    """
    content_zones = [
        zg for zg in zone_grids
        if zg["pz"].zone_type == "content"
        and zg["pz"].vsplit_group is None
    ]
    if len(content_zones) <= 1:
        return

    # Collect column split points (x_min of non-first columns)
    all_split_xs: List[float] = []
    for zg in content_zones:
        raw_cols = zg["grid"].get("_raw_columns", [])
        for col in raw_cols[1:]:
            all_split_xs.append(col["x_min"])

    if not all_split_xs:
        return

    all_split_xs.sort()
    merge_distance = max(25, int(content_w * 0.03))
    merged_xs = [all_split_xs[0]]
    for x in all_split_xs[1:]:
        if x - merged_xs[-1] < merge_distance:
            merged_xs[-1] = (merged_xs[-1] + x) / 2
        else:
            merged_xs.append(x)

    total_cols = len(merged_xs) + 1
    max_zone_cols = max(
        len(zg["grid"].get("_raw_columns", []))
        for zg in content_zones
    )

    if total_cols < max_zone_cols:
        return

    cx_min = min(w["left"] for w in all_words)
    cx_max = max(w["left"] + w["width"] for w in all_words)
    merged_columns: List[Dict[str, Any]] = []
    prev_x = cx_min
    for i, sx in enumerate(merged_xs):
        merged_columns.append({
            "index": i,
            "type": f"column_{i + 1}",
            "x_min": prev_x,
            "x_max": sx,
        })
        prev_x = sx
    merged_columns.append({
        "index": len(merged_xs),
        "type": f"column_{len(merged_xs) + 1}",
        "x_min": prev_x,
        "x_max": cx_max,
    })

    # Re-build ALL content zones with merged columns
    for zg in zone_grids:
        pz = zg["pz"]
        if pz.zone_type == "content":
            grid = _build_zone_grid(
                zg["words"], pz.x, pz.y,
                pz.width, pz.height,
                pz.index, img_w, img_h,
                global_columns=merged_columns,
                skip_first_row_header=bool(pz.image_overlays),
            )
            zg["grid"] = grid
    logger.info(
        "build-grid session %s: union of %d content "
        "zones -> %d merged columns (max single zone: %d)",
        session_id, len(content_zones),
        total_cols, max_zone_cols,
    )
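
# --- Editor's note: illustrative sketch, not part of the original commit. ---
# The column-merge pass above clusters nearby split x-positions: any split
# closer than merge_distance to the running cluster collapses into its
# average. Standalone reduction with made-up pixel values:
def _example_merge_split_xs() -> list:
    xs = sorted([300.0, 310.0, 620.0, 624.0, 900.0])
    merge_distance = 25
    merged = [xs[0]]
    for x in xs[1:]:
        if x - merged[-1] < merge_distance:
            merged[-1] = (merged[-1] + x) / 2
        else:
            merged.append(x)
    return merged  # -> [305.0, 622.0, 900.0]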
472
klausur-service/backend/vocab_worksheet_analysis_api.py
Normal file
@@ -0,0 +1,472 @@
"""
Vocabulary Worksheet Analysis API - OCR export, ground truth labeling,
extract-with-boxes, deskewed images, and learning unit generation.

The two large handlers (compare_ocr_methods, analyze_grid) live in
vocab_worksheet_compare_api.py and are included via compare_router.
"""

from fastapi import APIRouter, Body, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, Dict, Any
from datetime import datetime
import os
import io
import json
import logging


def _get_sessions():
    from vocab_worksheet_api import _sessions
    return _sessions


def _get_local_storage_path():
    from vocab_worksheet_api import LOCAL_STORAGE_PATH
    return LOCAL_STORAGE_PATH


from vocab_worksheet_generation import convert_pdf_page_to_image

# Try to import Tesseract extractor
try:
    from tesseract_vocab_extractor import (
        extract_bounding_boxes, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False

# Try to import Grid Detection Service
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False

logger = logging.getLogger(__name__)

analysis_router = APIRouter()


def _ocr_export_dir():
    return os.path.join(_get_local_storage_path(), "ocr-exports")


def _ground_truth_dir():
    return os.path.join(_get_local_storage_path(), "ground-truth")


# =============================================================================
# OCR Export Endpoints (for cross-app OCR data sharing)
# =============================================================================


@analysis_router.post("/sessions/{session_id}/ocr-export/{page_number}")
async def save_ocr_export(session_id: str, page_number: int, data: Dict[str, Any] = Body(...)):
    """
    Save OCR export data for cross-app sharing (admin-v2 -> studio-v2).

    Both apps proxy to klausur-service via /klausur-api/, so this endpoint
    serves as shared storage accessible from both ports.
    """
    logger.info(f"Saving OCR export for session {session_id}, page {page_number}")

    os.makedirs(_ocr_export_dir(), exist_ok=True)

    # Save the export data
    export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")
    with open(export_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Update latest pointer
    latest_path = os.path.join(_ocr_export_dir(), "latest.json")
    with open(latest_path, 'w', encoding='utf-8') as f:
        json.dump({
            "session_id": session_id,
            "page_number": page_number,
            "saved_at": datetime.utcnow().isoformat(),
        }, f, ensure_ascii=False, indent=2)

    return {
        "success": True,
        "session_id": session_id,
        "page_number": page_number,
        "message": "OCR export saved successfully",
    }


@analysis_router.get("/sessions/{session_id}/ocr-export/{page_number}")
async def load_ocr_export(session_id: str, page_number: int):
    """Load a specific OCR export by session and page number."""
    export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")

    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="OCR export not found")

    with open(export_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return data


@analysis_router.get("/ocr-export/latest")
async def load_latest_ocr_export():
    """Load the most recently saved OCR export data."""
    latest_path = os.path.join(_ocr_export_dir(), "latest.json")

    if not os.path.exists(latest_path):
        raise HTTPException(status_code=404, detail="No OCR exports found")

    with open(latest_path, 'r', encoding='utf-8') as f:
        pointer = json.load(f)

    session_id = pointer.get("session_id")
    page_number = pointer.get("page_number")

    export_path = os.path.join(_ocr_export_dir(), f"{session_id}_page{page_number}.json")

    if not os.path.exists(export_path):
        raise HTTPException(status_code=404, detail="Latest OCR export file not found")

    with open(export_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return data


# =============================================================================
# Extract with Boxes & Deskewed Image
# =============================================================================


async def extract_entries_with_boxes(image_bytes: bytes, lang: str = "eng+deu") -> dict:
    """Extract vocabulary entries with bounding boxes using Tesseract + GridDetectionService.

    Returns dict with 'entries' list and 'image_width'/'image_height'.
    Each entry has row_index, english, german, example, confidence, bbox, bbox_en, bbox_de, bbox_ex.
    All bbox coordinates are in percent (0-100).
    """
    if not TESSERACT_AVAILABLE:
        raise HTTPException(status_code=500, detail="Tesseract not available")
    if not GRID_SERVICE_AVAILABLE:
        raise HTTPException(status_code=500, detail="GridDetectionService not available")

    # Step 1: Tesseract word-level bounding boxes
    tess_result = await extract_bounding_boxes(image_bytes, lang=lang)
    words = tess_result.get("words", [])
    img_w = tess_result.get("image_width", 0)
    img_h = tess_result.get("image_height", 0)

    if not words or img_w == 0 or img_h == 0:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 2: Convert to OCR regions (percentage-based)
    service = GridDetectionService()
    regions = service.convert_tesseract_regions(words, img_w, img_h)

    if not regions:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 3: Detect grid
    grid_result = service.detect_grid(regions)

    if not grid_result.cells:
        return {"entries": [], "image_width": img_w, "image_height": img_h}

    # Step 4: Group cells by logical_row and column_type
    from services.grid_detection_service import ColumnType

    entries = []
    for row_idx, row_cells in enumerate(grid_result.cells):
        en_text = ""
        de_text = ""
        ex_text = ""
        en_bbox = None
        de_bbox = None
        ex_bbox = None
        row_conf_sum = 0.0
        row_conf_count = 0

        for cell in row_cells:
            cell_bbox = {"x": round(cell.x, 2), "y": round(cell.y, 2),
                         "w": round(cell.width, 2), "h": round(cell.height, 2)}

            if cell.column_type == ColumnType.ENGLISH:
                en_text = cell.text.strip()
                en_bbox = cell_bbox
            elif cell.column_type == ColumnType.GERMAN:
                de_text = cell.text.strip()
                de_bbox = cell_bbox
            elif cell.column_type == ColumnType.EXAMPLE:
                ex_text = cell.text.strip()
                ex_bbox = cell_bbox

            if cell.text.strip():
                row_conf_sum += cell.confidence
                row_conf_count += 1

        # Skip completely empty rows
        if not en_text and not de_text and not ex_text:
            continue

        # Calculate whole-row bounding box
        all_bboxes = [b for b in [en_bbox, de_bbox, ex_bbox] if b is not None]
        if all_bboxes:
            row_x = min(b["x"] for b in all_bboxes)
            row_y = min(b["y"] for b in all_bboxes)
            row_right = max(b["x"] + b["w"] for b in all_bboxes)
            row_bottom = max(b["y"] + b["h"] for b in all_bboxes)
            row_bbox = {"x": round(row_x, 2), "y": round(row_y, 2),
                        "w": round(row_right - row_x, 2), "h": round(row_bottom - row_y, 2)}
        else:
            row_bbox = {"x": 0, "y": 0, "w": 100, "h": 3}

        avg_conf = round((row_conf_sum / row_conf_count * 100) if row_conf_count > 0 else 0, 1)

        entries.append({
            "row_index": row_idx,
            "english": en_text,
            "german": de_text,
            "example": ex_text,
            "confidence": avg_conf,
            "bbox": row_bbox,
            "bbox_en": en_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_de": de_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
            "bbox_ex": ex_bbox or {"x": 0, "y": 0, "w": 0, "h": 0},
        })

    return {"entries": entries, "image_width": img_w, "image_height": img_h}


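# --- Editor's note: illustrative sketch, not part of the original commit. ---
# The whole-row bbox above is the axis-aligned union of the per-column
# boxes (all values in percent of page size). Standalone reduction with
# made-up values:
def _example_row_bbox_union() -> dict:
    boxes = [{"x": 5.0, "y": 10.0, "w": 30.0, "h": 3.0},
             {"x": 40.0, "y": 10.2, "w": 35.0, "h": 2.8}]
    x = min(b["x"] for b in boxes)
    y = min(b["y"] for b in boxes)
    right = max(b["x"] + b["w"] for b in boxes)
    bottom = max(b["y"] + b["h"] for b in boxes)
    return {"x": x, "y": y, "w": right - x, "h": bottom - y}
    # -> {"x": 5.0, "y": 10.0, "w": 70.0, "h": 3.0}

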
@analysis_router.post("/sessions/{session_id}/extract-with-boxes/{page_number}")
async def extract_with_boxes(session_id: str, page_number: int):
    """Extract vocabulary entries with bounding boxes for ground truth labeling.

    Uses Tesseract + GridDetectionService for spatial positioning.
    page_number is 0-indexed.
    """
    logger.info(f"Extract with boxes for session {session_id}, page {page_number}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Convert page to hires image
    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)

    # Deskew image before OCR
    deskew_angle = 0.0
    try:
        from cv_vocab_pipeline import deskew_image_by_word_alignment, CV2_AVAILABLE
        if CV2_AVAILABLE:
            image_data, deskew_angle = deskew_image_by_word_alignment(image_data)
            logger.info(f"Deskew: {deskew_angle:.2f}° for page {page_number}")
    except Exception as e:
        logger.warning(f"Deskew failed for page {page_number}: {e}")

    # Cache deskewed image in session for later serving
    if "deskewed_images" not in session:
        session["deskewed_images"] = {}
    session["deskewed_images"][str(page_number)] = image_data

    # Extract entries with boxes (now on deskewed image)
    result = await extract_entries_with_boxes(image_data)

    # Cache in session
    if "gt_entries" not in session:
        session["gt_entries"] = {}
    session["gt_entries"][str(page_number)] = result["entries"]

    return {
        "success": True,
        "entries": result["entries"],
        "entry_count": len(result["entries"]),
        "image_width": result["image_width"],
        "image_height": result["image_height"],
        "deskew_angle": round(deskew_angle, 2),
        "deskewed": abs(deskew_angle) > 0.05,
    }


@analysis_router.get("/sessions/{session_id}/deskewed-image/{page_number}")
async def get_deskewed_image(session_id: str, page_number: int):
    """Return the deskewed page image as PNG.

    Falls back to the original hires image if no deskewed version is cached.
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    deskewed = session.get("deskewed_images", {}).get(str(page_number))

    if deskewed:
        return StreamingResponse(io.BytesIO(deskewed), media_type="image/png")

    # Fallback: render original hires image
    pdf_data = session.get("pdf_data")
    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
    return StreamingResponse(io.BytesIO(image_data), media_type="image/png")


# =============================================================================
# Ground Truth Labeling
# =============================================================================


@analysis_router.post("/sessions/{session_id}/ground-truth/{page_number}")
async def save_ground_truth(session_id: str, page_number: int, data: dict = Body(...)):
    """Save ground truth labels for a page.

    Expects body with 'entries' list - each entry has english, german, example,
    status ('confirmed' | 'edited' | 'skipped'), and bbox fields.
    """
    logger.info(f"Save ground truth for session {session_id}, page {page_number}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    entries = data.get("entries", [])
    if not entries:
        raise HTTPException(status_code=400, detail="No entries provided")

    # Save in session
    session = _get_sessions()[session_id]
    if "ground_truth" not in session:
        session["ground_truth"] = {}
    session["ground_truth"][str(page_number)] = entries

    # Also save to disk
    os.makedirs(_ground_truth_dir(), exist_ok=True)
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    gt_data = {
        "session_id": session_id,
        "page_number": page_number,
        "saved_at": datetime.now().isoformat(),
        "entry_count": len(entries),
        "entries": entries,
    }
    with open(gt_path, 'w', encoding='utf-8') as f:
        json.dump(gt_data, f, ensure_ascii=False, indent=2)

    logger.info(f"Ground truth saved: {len(entries)} entries to {gt_path}")

    confirmed = sum(1 for e in entries if e.get("status") == "confirmed")
    edited = sum(1 for e in entries if e.get("status") == "edited")
    skipped = sum(1 for e in entries if e.get("status") == "skipped")

    return {
        "success": True,
        "saved_count": len(entries),
        "confirmed": confirmed,
        "edited": edited,
        "skipped": skipped,
        "file_path": gt_path,
    }


@analysis_router.get("/sessions/{session_id}/ground-truth/{page_number}")
async def load_ground_truth(session_id: str, page_number: int):
    """Load saved ground truth for a page."""
    logger.info(f"Load ground truth for session {session_id}, page {page_number}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    # Try session cache first
    session = _get_sessions()[session_id]
    cached = session.get("ground_truth", {}).get(str(page_number))
    if cached:
        return {"success": True, "entries": cached, "source": "cache"}

    # Try disk
    gt_path = os.path.join(_ground_truth_dir(), f"{session_id}_page{page_number}.json")
    if not os.path.exists(gt_path):
        raise HTTPException(status_code=404, detail="No ground truth found for this page")

    with open(gt_path, 'r', encoding='utf-8') as f:
        gt_data = json.load(f)

    return {"success": True, "entries": gt_data.get("entries", []), "source": "disk"}


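# --- Editor's note: illustrative sketch, not part of the original commit. ---
# Shape of one ground-truth entry as the endpoints above store it (field
# values are made up; bbox uses the percent convention from
# extract_entries_with_boxes):
_EXAMPLE_GT_ENTRY = {
    "english": "tiger",
    "german": "der Tiger",
    "example": "The tiger sleeps.",
    "status": "confirmed",  # or "edited" / "skipped"
    "bbox": {"x": 5.0, "y": 10.0, "w": 70.0, "h": 3.0},
}

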
# ─── Learning Module Generation ─────────────────────────────────────────────


class GenerateLearningUnitRequest(BaseModel):
    grade: Optional[str] = None
    generate_modules: bool = True


@analysis_router.post("/sessions/{session_id}/generate-learning-unit")
async def generate_learning_unit_endpoint(session_id: str, request: GenerateLearningUnitRequest = None):
    """
    Create a Learning Unit from the vocabulary in this session.

    1. Takes vocabulary from the session
    2. Creates a Learning Unit in backend-lehrer
    3. Optionally triggers MC/Cloze/QA generation

    Returns the created unit info and generation status.
    """
    if request is None:
        request = GenerateLearningUnitRequest()

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    vocabulary = session.get("vocabulary", [])

    if not vocabulary:
        raise HTTPException(status_code=400, detail="No vocabulary in this session")

    try:
        from vocab_learn_bridge import create_learning_unit, generate_learning_modules

        # Step 1: Create Learning Unit
        result = await create_learning_unit(
            session_name=session["name"],
            vocabulary=vocabulary,
            grade=request.grade,
        )

        # Step 2: Generate modules if requested
        if request.generate_modules:
            try:
                gen_result = await generate_learning_modules(
                    unit_id=result["unit_id"],
                    analysis_path=result["analysis_path"],
                )
                result["generation"] = gen_result
            except Exception as e:
                logger.warning(f"Module generation failed (unit created): {e}")
                result["generation"] = {"status": "error", "reason": str(e)}

        return result

    except ImportError:
        raise HTTPException(status_code=501, detail="vocab_learn_bridge module not available")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=502, detail=str(e))


# =============================================================================
# Include compare_ocr_methods & analyze_grid from companion module
# =============================================================================

from vocab_worksheet_compare_api import compare_router  # noqa: E402

analysis_router.include_router(compare_router)
File diff suppressed because it is too large
545
klausur-service/backend/vocab_worksheet_compare_api.py
Normal file
@@ -0,0 +1,545 @@
"""
Vocabulary Worksheet Compare & Grid Analysis API.

Split from vocab_worksheet_analysis_api.py — contains the two largest
route handlers: compare_ocr_methods (~234 LOC) and analyze_grid (~255 LOC).
"""

from fastapi import APIRouter, HTTPException, Query
from typing import Dict, Any
import base64
import json
import logging
import os

from vocab_worksheet_extraction import extract_vocabulary_from_image

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("VISION_MODEL", "llama3.2-vision:11b")


def _get_sessions():
    from vocab_worksheet_api import _sessions
    return _sessions


from vocab_worksheet_generation import convert_pdf_page_to_image

# Try to import Tesseract extractor
try:
    from tesseract_vocab_extractor import (
        run_tesseract_pipeline,
        match_positions_to_vocab, TESSERACT_AVAILABLE,
    )
except ImportError:
    TESSERACT_AVAILABLE = False

# Try to import CV Pipeline
try:
    from cv_vocab_pipeline import run_cv_pipeline, CV_PIPELINE_AVAILABLE
except ImportError:
    CV_PIPELINE_AVAILABLE = False

# Try to import Grid Detection Service
try:
    from services.grid_detection_service import GridDetectionService
    GRID_SERVICE_AVAILABLE = True
except ImportError:
    GRID_SERVICE_AVAILABLE = False

logger = logging.getLogger(__name__)

compare_router = APIRouter()


# =============================================================================
# OCR Compare & Grid Analysis Endpoints
# =============================================================================


@compare_router.post("/sessions/{session_id}/compare-ocr/{page_number}")
|
||||
async def compare_ocr_methods(session_id: str, page_number: int):
|
||||
"""
|
||||
Run multiple OCR methods on a page and compare results.
|
||||
|
||||
This endpoint:
|
||||
1. Gets the page image from the session's uploaded PDF
|
||||
2. Runs Vision LLM extraction (primary method)
|
||||
3. Optionally runs Tesseract extraction
|
||||
4. Compares found vocabulary across methods
|
||||
5. Returns structured comparison results
|
||||
|
||||
page_number is 0-indexed.
|
||||
"""
|
||||
import httpx
|
||||
import time
|
||||
|
||||
logger.info(f"Compare OCR for session {session_id}, page {page_number}")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")
|
||||
|
||||
# Convert page to image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
methods_results = {}
|
||||
all_vocab_sets = {}
|
||||
|
||||
# --- Method: Vision LLM ---
|
||||
try:
|
||||
start = time.time()
|
||||
vocab, confidence, error = await extract_vocabulary_from_image(
|
||||
image_data, f"page_{page_number + 1}.png", page_number=page_number, use_hybrid=False
|
||||
)
|
||||
duration = time.time() - start
|
||||
|
||||
vocab_list = []
|
||||
for v in vocab:
|
||||
entry = v.dict() if hasattr(v, 'dict') else (v.__dict__.copy() if hasattr(v, '__dict__') else dict(v))
|
||||
vocab_list.append({
|
||||
"english": entry.get("english", ""),
|
||||
"german": entry.get("german", ""),
|
||||
"example": entry.get("example_sentence", ""),
|
||||
})
|
||||
|
||||
methods_results["vision_llm"] = {
|
||||
"name": "Vision LLM",
|
||||
"model": VISION_MODEL,
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list),
|
||||
"vocabulary": vocab_list,
|
||||
"confidence": confidence,
|
||||
"success": len(vocab_list) > 0 and not error,
|
||||
"error": error if error else None,
|
||||
}
|
||||
all_vocab_sets["vision_llm"] = {(v["english"].lower().strip(), v["german"].lower().strip()) for v in vocab_list if v["english"] and v["german"]}
|
||||
except Exception as e:
|
||||
logger.error(f"Vision LLM failed: {e}")
|
||||
methods_results["vision_llm"] = {
|
||||
"name": "Vision LLM",
|
||||
"model": VISION_MODEL,
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["vision_llm"] = set()
|
||||
|
||||
# --- Method: Tesseract OCR (bounding boxes + vocab extraction) ---
|
||||
if TESSERACT_AVAILABLE:
|
||||
try:
|
||||
start = time.time()
|
||||
tess_result = await run_tesseract_pipeline(image_data, lang="eng+deu")
|
||||
duration = time.time() - start
|
||||
|
||||
tess_vocab = tess_result.get("vocabulary", [])
|
||||
tess_words = tess_result.get("words", [])
|
||||
|
||||
# Store Tesseract words in session for later use (grid analysis, position matching)
|
||||
session["tesseract_words"] = tess_words
|
||||
session["tesseract_image_width"] = tess_result.get("image_width", 0)
|
||||
session["tesseract_image_height"] = tess_result.get("image_height", 0)
|
||||
session[f"tesseract_page_{page_number}"] = tess_result
|
||||
|
||||
vocab_list_tess = []
|
||||
for v in tess_vocab:
|
||||
vocab_list_tess.append({
|
||||
"english": v.get("english", ""),
|
||||
"german": v.get("german", ""),
|
||||
"example": v.get("example", ""),
|
||||
})
|
||||
|
||||
methods_results["tesseract"] = {
|
||||
"name": "Tesseract OCR",
|
||||
"model": "tesseract-ocr (eng+deu)",
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list_tess),
|
||||
"vocabulary": vocab_list_tess,
|
||||
"confidence": 0.7 if tess_vocab else 0,
|
||||
"success": len(vocab_list_tess) > 0,
|
||||
"error": tess_result.get("error"),
|
||||
"word_count": tess_result.get("word_count", 0),
|
||||
"columns_detected": len(tess_result.get("columns", [])),
|
||||
}
|
||||
all_vocab_sets["tesseract"] = {
|
||||
(v["english"].lower().strip(), v["german"].lower().strip())
|
||||
for v in vocab_list_tess if v["english"] and v["german"]
|
||||
}
|
||||
|
||||
# Fuzzy-match: attach Tesseract bounding boxes to Vision LLM results
|
||||
if "vision_llm" in methods_results and methods_results["vision_llm"]["success"]:
|
||||
llm_vocab_with_bbox = match_positions_to_vocab(
|
||||
tess_words,
|
||||
methods_results["vision_llm"]["vocabulary"],
|
||||
tess_result.get("image_width", 1),
|
||||
tess_result.get("image_height", 1),
|
||||
)
|
||||
methods_results["vision_llm"]["vocabulary"] = llm_vocab_with_bbox
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Tesseract failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
methods_results["tesseract"] = {
|
||||
"name": "Tesseract OCR",
|
||||
"model": "tesseract-ocr",
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["tesseract"] = set()
|
||||
|
||||
# --- Method: CV Pipeline (Document Reconstruction) ---
|
||||
if CV_PIPELINE_AVAILABLE:
|
||||
try:
|
||||
start = time.time()
|
||||
cv_result = await run_cv_pipeline(pdf_data=pdf_data, page_number=page_number)
|
||||
duration = time.time() - start
|
||||
|
||||
cv_vocab = cv_result.vocabulary if not cv_result.error else []
|
||||
vocab_list_cv = []
|
||||
for v in cv_vocab:
|
||||
vocab_list_cv.append({
|
||||
"english": v.get("english", ""),
|
||||
"german": v.get("german", ""),
|
||||
"example": v.get("example", ""),
|
||||
})
|
||||
|
||||
methods_results["cv_pipeline"] = {
|
||||
"name": "CV Pipeline (Document Reconstruction)",
|
||||
"model": "opencv + tesseract (multi-pass)",
|
||||
"duration_seconds": round(duration, 1),
|
||||
"vocabulary_count": len(vocab_list_cv),
|
||||
"vocabulary": vocab_list_cv,
|
||||
"confidence": 0.8 if cv_vocab else 0,
|
||||
"success": len(vocab_list_cv) > 0,
|
||||
"error": cv_result.error,
|
||||
"word_count": cv_result.word_count,
|
||||
"columns_detected": cv_result.columns_detected,
|
||||
"stages": cv_result.stages,
|
||||
}
|
||||
all_vocab_sets["cv_pipeline"] = {
|
||||
(v["english"].lower().strip(), v["german"].lower().strip())
|
||||
for v in vocab_list_cv if v["english"] and v["german"]
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"CV Pipeline failed: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())
|
||||
methods_results["cv_pipeline"] = {
|
||||
"name": "CV Pipeline (Document Reconstruction)",
|
||||
"model": "opencv + tesseract (multi-pass)",
|
||||
"duration_seconds": 0,
|
||||
"vocabulary_count": 0,
|
||||
"vocabulary": [],
|
||||
"confidence": 0,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
}
|
||||
all_vocab_sets["cv_pipeline"] = set()
|
||||
|
||||
# --- Build comparison ---
|
||||
all_unique = set()
|
||||
for vs in all_vocab_sets.values():
|
||||
all_unique |= vs
|
||||
|
||||
found_by_all = []
|
||||
found_by_some = []
|
||||
for english, german in sorted(all_unique):
|
||||
found_in = [m for m, vs in all_vocab_sets.items() if (english, german) in vs]
|
||||
entry = {"english": english, "german": german, "methods": found_in}
|
||||
if len(found_in) == len(all_vocab_sets):
|
||||
found_by_all.append(entry)
|
||||
else:
|
||||
found_by_some.append(entry)
|
||||
|
||||
total_methods = max(len(all_vocab_sets), 1)
|
||||
agreement_rate = len(found_by_all) / max(len(all_unique), 1) if all_unique else 0
|
||||
|
||||
# Find best method
|
||||
best_method = max(all_vocab_sets, key=lambda m: len(all_vocab_sets[m])) if all_vocab_sets else "vision_llm"
|
||||
|
||||
return {
|
||||
"session_id": session_id,
|
||||
"page_number": page_number,
|
||||
"methods": methods_results,
|
||||
"comparison": {
|
||||
"found_by_all_methods": found_by_all,
|
||||
"found_by_some_methods": found_by_some,
|
||||
"total_unique_vocabulary": len(all_unique),
|
||||
"agreement_rate": agreement_rate,
|
||||
},
|
||||
"recommendation": {
|
||||
"best_method": best_method,
|
||||
"reason": f"{len(all_vocab_sets.get(best_method, set()))} Vokabeln erkannt mit hoher Konfidenz",
|
||||
},
|
||||
}
|
||||
|
||||
|
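# --- Editor's note: illustrative sketch, not part of the original commit. ---
# Method agreement above is computed over normalized (english, german)
# tuples. Standalone reduction with two fake method outputs:
def _example_agreement() -> float:
    sets = {
        "vision_llm": {("tiger", "der tiger"), ("lion", "der löwe")},
        "tesseract": {("tiger", "der tiger")},
    }
    union = set().union(*sets.values())
    in_all = [p for p in union if all(p in s for s in sets.values())]
    return len(in_all) / len(union)  # -> 0.5

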
||||
@compare_router.post("/sessions/{session_id}/analyze-grid/{page_number}")
|
||||
async def analyze_grid(session_id: str, page_number: int, use_tesseract: bool = Query(True)):
|
||||
"""
|
||||
Analyze the grid/table structure of a vocabulary page.
|
||||
|
||||
Hybrid approach:
|
||||
1. If Tesseract bounding boxes are available (from compare-ocr), use them for
|
||||
real spatial positions via GridDetectionService.
|
||||
2. Otherwise fall back to Vision LLM for grid structure detection.
|
||||
|
||||
page_number is 0-indexed.
|
||||
Returns GridData structure expected by the frontend GridOverlay component.
|
||||
"""
|
||||
import httpx
|
||||
import time
|
||||
|
||||
logger.info(f"Grid analysis for session {session_id}, page {page_number} (use_tesseract={use_tesseract})")
|
||||
|
||||
if session_id not in _get_sessions():
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
session = _get_sessions()[session_id]
|
||||
pdf_data = session.get("pdf_data")
|
||||
|
||||
if not pdf_data:
|
||||
raise HTTPException(status_code=400, detail="No PDF uploaded for this session")
|
||||
|
||||
page_count = session.get("pdf_page_count", 1)
|
||||
if page_number < 0 or page_number >= page_count:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid page number.")
|
||||
|
||||
# Convert page to image
|
||||
image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
|
||||
|
||||
# --- Strategy 1: Use Tesseract bounding boxes + GridDetectionService ---
|
||||
tess_page_data = session.get(f"tesseract_page_{page_number}")
|
||||
|
||||
if use_tesseract and TESSERACT_AVAILABLE and GRID_SERVICE_AVAILABLE:
|
||||
try:
|
||||
# Run Tesseract if not already cached
|
||||
if not tess_page_data:
|
||||
logger.info("Running Tesseract for grid analysis (not cached)")
|
||||
from tesseract_vocab_extractor import run_tesseract_pipeline as _run_tess
|
||||
tess_page_data = await _run_tess(image_data, lang="eng+deu")
|
||||
session[f"tesseract_page_{page_number}"] = tess_page_data
|
||||
session["tesseract_words"] = tess_page_data.get("words", [])
|
||||
session["tesseract_image_width"] = tess_page_data.get("image_width", 0)
|
||||
session["tesseract_image_height"] = tess_page_data.get("image_height", 0)
|
||||
|
||||
tess_words = tess_page_data.get("words", [])
|
||||
img_w = tess_page_data.get("image_width", 0)
|
||||
img_h = tess_page_data.get("image_height", 0)
|
||||
|
||||
if tess_words and img_w > 0 and img_h > 0:
|
||||
service = GridDetectionService()
|
||||
regions = service.convert_tesseract_regions(tess_words, img_w, img_h)
|
||||
|
||||
if regions:
|
||||
grid_result = service.detect_grid(regions)
|
||||
grid_dict = grid_result.to_dict()
|
||||
|
||||
# Merge LLM text if available (better quality than Tesseract text)
|
||||
# The LLM vocab was stored during compare-ocr
|
||||
grid_dict["source"] = "tesseract+grid_service"
|
||||
grid_dict["word_count"] = len(tess_words)
|
||||
|
||||
logger.info(f"Tesseract grid: {grid_result.rows}x{grid_result.columns}, "
|
||||
f"{grid_result.stats.get('recognized', 0)} recognized")
|
||||
|
||||
return {"success": True, "grid": grid_dict}
|
||||
|
||||
logger.info("Tesseract data insufficient, falling back to LLM")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Tesseract grid analysis failed, falling back to LLM: {e}")
|
||||
import traceback
|
||||
logger.debug(traceback.format_exc())

    # --- Strategy 2: Fall back to Vision LLM ---
    image_base64 = base64.b64encode(image_data).decode("utf-8")

    grid_prompt = """Analyze this textbook page image. It contains a vocabulary table/grid.

Your task: Identify the TABLE STRUCTURE and extract each cell's content.

Return a JSON object with this EXACT structure:
{
  "rows": <number of rows>,
  "columns": <number of columns>,
  "column_types": ["english", "german", "example"],
  "entries": [
    {
      "row": 0,
      "col": 0,
      "text": "the word or phrase in this cell",
      "column_type": "english",
      "confidence": 0.95
    }
  ]
}

Rules:
- row and col are 0-indexed
- column_type is one of: "english", "german", "example", "unknown"
- Detect whether each column contains English words, German translations, or example sentences
- Include ALL non-empty cells
- confidence is 0.0-1.0 based on how clear the text is
- If a cell is empty, don't include it
- Return ONLY the JSON, no other text"""

    try:
        import asyncio

        raw_text = ""
        max_retries = 3
        for attempt in range(max_retries):
            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    f"{OLLAMA_URL}/api/chat",
                    json={
                        "model": VISION_MODEL,
                        "messages": [{"role": "user", "content": grid_prompt, "images": [image_base64]}],
                        "stream": False,
                        "options": {"temperature": 0.1, "num_predict": 8192},
                    },
                    timeout=300.0,
                )

                if response.status_code == 500 and attempt < max_retries - 1:
                    wait_time = 10 * (attempt + 1)
                    logger.warning(f"Ollama returned 500, retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                    await asyncio.sleep(wait_time)
                    continue
                elif response.status_code != 200:
                    error_detail = response.text[:200] if response.text else "Unknown error"
                    return {"success": False, "error": f"Ollama Fehler ({response.status_code}): {error_detail}. Bitte erneut versuchen - evtl. laeuft noch ein anderer OCR-Request."}

                raw_text = response.json().get("message", {}).get("content", "")
                break
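
        # Retry waits grow linearly (10 s, then 20 s) before the final attempt;
        # per the error text above, a 500 from Ollama usually means another
        # OCR request is still running.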

        # Parse JSON from response
        import re
        json_match = re.search(r'\{[\s\S]*\}', raw_text)
        if not json_match:
            return {"success": False, "error": "Could not parse grid structure from LLM response"}

        grid_raw = json.loads(json_match.group())

        num_rows = grid_raw.get("rows", 0)
        num_cols = grid_raw.get("columns", 0)
        column_types = grid_raw.get("column_types", [])
        entries = grid_raw.get("entries", [])

        if num_rows == 0 or num_cols == 0:
            return {"success": False, "error": "No grid structure detected"}

        # Ensure column_types has the right length
        while len(column_types) < num_cols:
            column_types.append("unknown")

        # Build cell grid with percentage-based coordinates
        row_height = 100.0 / num_rows
        col_width = 100.0 / num_cols
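
        # Worked example: a 5x3 grid gives row_height = 20.0 and col_width = 33.33,
        # so cell (row=2, col=1) is drawn at x = 33.33 %, y = 40.0 % of the page.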

        # Track which cells have content
        cell_map = {}
        for entry in entries:
            r = entry.get("row", 0)
            c = entry.get("col", 0)
            cell_map[(r, c)] = entry

        cells = []
        recognized_count = 0
        empty_count = 0
        problematic_count = 0

        for r in range(num_rows):
            row_cells = []
            for c in range(num_cols):
                x = c * col_width
                y = r * row_height

                if (r, c) in cell_map:
                    entry = cell_map[(r, c)]
                    text = entry.get("text", "").strip()
                    conf = entry.get("confidence", 0.8)
                    col_type = entry.get("column_type", column_types[c] if c < len(column_types) else "unknown")

                    if text:
                        status = "recognized" if conf >= 0.5 else "problematic"
                        if status == "recognized":
                            recognized_count += 1
                        else:
                            problematic_count += 1
                    else:
                        status = "empty"
                        empty_count += 1
                else:
                    text = ""
                    conf = 0.0
                    col_type = column_types[c] if c < len(column_types) else "unknown"
                    status = "empty"
                    empty_count += 1

                row_cells.append({
                    "row": r,
                    "col": c,
                    "x": round(x, 2),
                    "y": round(y, 2),
                    "width": round(col_width, 2),
                    "height": round(row_height, 2),
                    "text": text,
                    "confidence": conf,
                    "status": status,
                    "column_type": col_type,
                })
            cells.append(row_cells)

        total = num_rows * num_cols
        coverage = (recognized_count + problematic_count) / max(total, 1)

        # Column and row boundaries as percentages
        col_boundaries = [round(c * col_width, 2) for c in range(num_cols + 1)]
        row_boundaries = [round(r * row_height, 2) for r in range(num_rows + 1)]

        grid_data = {
            "rows": num_rows,
            "columns": num_cols,
            "cells": cells,
            "column_types": column_types,
            "column_boundaries": col_boundaries,
            "row_boundaries": row_boundaries,
            "deskew_angle": 0.0,
            "source": "vision_llm",
            "stats": {
                "recognized": recognized_count,
                "problematic": problematic_count,
                "empty": empty_count,
                "manual": 0,
                "total": total,
                "coverage": round(coverage, 3),
            },
        }

        return {"success": True, "grid": grid_data}

    except httpx.TimeoutException:
        logger.error("Grid analysis timed out")
        return {"success": False, "error": "Grid-Analyse Timeout (Ollama zu langsam)"}
    except Exception as e:
        logger.error(f"Grid analysis failed: {e}")
        import traceback
        logger.debug(traceback.format_exc())
        return {"success": False, "error": f"Grid-Analyse fehlgeschlagen: {str(e)}"}
325
klausur-service/backend/vocab_worksheet_extraction.py
Normal file
@@ -0,0 +1,325 @@
"""Vocabulary extraction from images using Vision LLM and hybrid OCR+LLM.

Contains:
- VOCAB_EXTRACTION_PROMPT: Prompt template for Vision LLM extraction
- extract_vocabulary_from_image(): Core extraction (hybrid or Vision LLM)
- _get_demo_vocabulary(): Demo data for testing
- parse_vocabulary_json(): Robust JSON parsing with 4-strategy fallback
"""

import base64
import json
import logging
import os
import re
import uuid
from typing import List

import httpx

from vocab_worksheet_models import VocabularyEntry

logger = logging.getLogger(__name__)

# Ollama Configuration
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "qwen2.5vl:32b")


# =============================================================================
# Vision LLM Vocabulary Extraction
# =============================================================================

VOCAB_EXTRACTION_PROMPT = """Analysiere dieses Bild einer Vokabelliste aus einem Schulbuch.

AUFGABE: Extrahiere alle Vokabeleintraege in folgendem JSON-Format:

{
  "vocabulary": [
    {
      "english": "to improve",
      "german": "verbessern",
      "example": "I want to improve my English."
    }
  ]
}

REGELN:
1. Erkenne das typische 3-Spalten-Layout: Englisch | Deutsch | Beispielsatz
2. Behalte die exakte Schreibweise bei
3. Bei fehlenden Beispielsaetzen: "example": null
4. Ignoriere Seitenzahlen, Ueberschriften, Kapitelnummern
5. Gib NUR valides JSON zurueck, keine Erklaerungen
6. Wenn Wortarten angegeben sind (n, v, adj), extrahiere sie als "word_type"

Beispiel-Output:
{
  "vocabulary": [
    {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "Her achievements were impressive.", "word_type": "n"},
    {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals.", "word_type": "v"}
  ]
}"""


async def extract_vocabulary_from_image(
    image_data: bytes,
    filename: str,
    page_number: int = 0,
    use_hybrid: bool = False  # DISABLED: PaddleOCR crashes on ARM64 Mac Mini
) -> tuple[List[VocabularyEntry], float, str]:
    """
    Extract vocabulary from an image using hybrid OCR+LLM or Vision LLM (default).

    Args:
        image_data: Image bytes
        filename: Original filename for logging
        page_number: 0-indexed page number for error messages
        use_hybrid: If True, use PaddleOCR + LLM (faster, more accurate for printed text).
                    If False (the default), use Vision LLM (slower, better for complex layouts).

    Returns:
        Tuple of (vocabulary_entries, confidence, error_message)
        error_message is empty string on success
    """

    # ==========================================================================
    # HYBRID APPROACH (opt-in, disabled by default): PaddleOCR + LLM Gateway
    # ==========================================================================
    if use_hybrid:
        try:
            from hybrid_vocab_extractor import extract_vocabulary_hybrid
            logger.info(f"Using HYBRID extraction for {filename} (PaddleOCR + LLM)")

            vocab_dicts, confidence, error = await extract_vocabulary_hybrid(image_data, page_number)

            if error:
                logger.warning(f"Hybrid extraction had issues: {error}")
                # Fall through to Vision LLM fallback
            elif vocab_dicts:
                # Convert dicts to VocabularyEntry objects
                vocabulary = [
                    VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=v.get("english", ""),
                        german=v.get("german", ""),
                        example_sentence=v.get("example"),
                        source_page=page_number + 1
                    )
                    for v in vocab_dicts
                    if v.get("english") and v.get("german")
                ]
                logger.info(f"Hybrid extraction: {len(vocabulary)} entries from {filename}")
                return vocabulary, confidence, ""

        except ImportError as e:
            logger.warning(f"Hybrid extractor not available: {e}. Falling back to Vision LLM.")
        except Exception as e:
            logger.warning(f"Hybrid extraction failed: {e}. Falling back to Vision LLM.")
            import traceback
            logger.debug(traceback.format_exc())

    # ==========================================================================
    # FALLBACK: Vision LLM via Ollama (model set by OLLAMA_VISION_MODEL)
    # ==========================================================================
    logger.info(f"Using VISION LLM extraction for {filename}")

    try:
        # First check if Ollama is available
        async with httpx.AsyncClient(timeout=10.0) as check_client:
            try:
                health_response = await check_client.get(f"{OLLAMA_URL}/api/tags")
                if health_response.status_code != 200:
                    logger.error(f"Ollama not available at {OLLAMA_URL}")
                    return [], 0.0, f"Seite {page_number + 1}: Ollama nicht verfuegbar"
            except Exception as e:
                logger.error(f"Ollama health check failed: {e}")
                return [], 0.0, f"Seite {page_number + 1}: Verbindung zu Ollama fehlgeschlagen"

        image_base64 = base64.b64encode(image_data).decode("utf-8")

        payload = {
            "model": VISION_MODEL,
            "messages": [
                {
                    "role": "user",
                    "content": VOCAB_EXTRACTION_PROMPT,
                    "images": [image_base64]
                }
            ],
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 4096,
            }
        }

        logger.info(f"Extracting vocabulary from {filename} ({len(image_data)} bytes) using {VISION_MODEL}")

        # Increased timeout for Vision models (they can be slow)
        async with httpx.AsyncClient(timeout=600.0) as client:
            response = await client.post(
                f"{OLLAMA_URL}/api/chat",
                json=payload,
                timeout=300.0  # 5 minutes per page
            )
            response.raise_for_status()

            data = response.json()
            extracted_text = data.get("message", {}).get("content", "")

        logger.info(f"Ollama response received: {len(extracted_text)} chars")

        # Parse JSON from response
        vocabulary = parse_vocabulary_json(extracted_text)

        # Set source_page for each entry
        for v in vocabulary:
            v.source_page = page_number + 1

        # Estimate confidence
        confidence = 0.85 if len(vocabulary) > 0 else 0.1

        logger.info(f"Vision LLM extracted {len(vocabulary)} vocabulary entries from {filename}")

        return vocabulary, confidence, ""

    except httpx.TimeoutException:
        logger.error(f"Ollama request timed out for {filename} (model: {VISION_MODEL})")
        return [], 0.0, f"Seite {page_number + 1}: Timeout - Verarbeitung dauerte zu lange"
    except Exception as e:
        logger.error(f"Vocabulary extraction failed for {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return [], 0.0, f"Seite {page_number + 1}: Fehler - {str(e)[:50]}"


def _get_demo_vocabulary() -> List[VocabularyEntry]:
    """Return demo vocabulary for testing when Vision LLM is not available."""
    demo_entries = [
        {"english": "to achieve", "german": "erreichen, erzielen", "example": "She achieved her goals."},
        {"english": "achievement", "german": "Leistung, Errungenschaft", "example": "That was a great achievement."},
        {"english": "improve", "german": "verbessern", "example": "I want to improve my English."},
        {"english": "improvement", "german": "Verbesserung", "example": "There has been a lot of improvement."},
        {"english": "success", "german": "Erfolg", "example": "The project was a success."},
        {"english": "successful", "german": "erfolgreich", "example": "She is a successful businesswoman."},
        {"english": "fail", "german": "scheitern, durchfallen", "example": "Don't be afraid to fail."},
        {"english": "failure", "german": "Misserfolg, Versagen", "example": "Failure is part of learning."},
    ]
    return [
        VocabularyEntry(
            id=str(uuid.uuid4()),
            english=e["english"],
            german=e["german"],
            example_sentence=e.get("example"),
        )
        for e in demo_entries
    ]


def parse_vocabulary_json(text: str) -> List[VocabularyEntry]:
    """Parse vocabulary JSON from LLM response with robust error handling."""

    def clean_json_string(s: str) -> str:
        """Clean a JSON string by removing control characters and fixing common issues."""
        # Remove control characters except newlines and tabs
        s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
        # Replace unescaped newlines within strings with space
        # This is a simplistic approach - replace actual newlines with escaped ones
        s = s.replace('\n', '\\n').replace('\r', '\\r').replace('\t', '\\t')
        return s

    def try_parse_json(json_str: str) -> dict:
        """Try multiple strategies to parse JSON."""
        # Strategy 1: Direct parse
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Clean and parse
        try:
            cleaned = clean_json_string(json_str)
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        # Strategy 3: Try to fix common issues
        try:
            # Remove trailing commas before } or ]
            fixed = re.sub(r',(\s*[}\]])', r'\1', json_str)
            # Fix unquoted keys
            fixed = re.sub(r'(\{|\,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', fixed)
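            # e.g. '{"vocabulary": [{english: "to run", "german": "laufen",},]}'
            # becomes '{"vocabulary": [{"english": "to run", "german": "laufen"}]}'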
            return json.loads(fixed)
        except json.JSONDecodeError:
            pass

        return None

    try:
        # Find JSON in response (may have extra text)
        start = text.find('{')
        end = text.rfind('}') + 1

        if start == -1 or end == 0:
            logger.warning("No JSON found in response")
            return []

        json_str = text[start:end]
        data = try_parse_json(json_str)

        if data is None:
            # Strategy 4: Extract vocabulary entries using regex as fallback
            logger.warning("JSON parsing failed, trying regex extraction")
            vocabulary = []
            # Match patterns like {"english": "...", "german": "...", ...}
            pattern = r'\{\s*"english"\s*:\s*"([^"]*?)"\s*,\s*"german"\s*:\s*"([^"]*?)"(?:\s*,\s*"example"\s*:\s*(?:"([^"]*?)"|null))?'
            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
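
            # Each match is an (english, german, example) tuple; e.g. the fragment
            # '{"english": "to fail", "german": "scheitern", "example": null}'
            # yields ("to fail", "scheitern", "") since the example group is
            # empty for null.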
            for match in matches:
                english = match[0].strip() if match[0] else ""
                german = match[1].strip() if match[1] else ""
                example = match[2].strip() if len(match) > 2 and match[2] else None

                if english and german:
                    vocab_entry = VocabularyEntry(
                        id=str(uuid.uuid4()),
                        english=english,
                        german=german,
                        example_sentence=example,
                    )
                    vocabulary.append(vocab_entry)

            if vocabulary:
                logger.info(f"Regex extraction found {len(vocabulary)} entries")
                return vocabulary

            # Regex fallback found nothing either - give up cleanly instead of
            # letting data.get() raise on None below
            logger.warning("Regex extraction found no entries")
            return []

        # Normal JSON parsing succeeded
        vocabulary = []
        for entry in data.get("vocabulary", []):
            english = entry.get("english", "").strip()
            german = entry.get("german", "").strip()

            # Skip entries that look like hallucinations (very long or containing unusual patterns)
            if len(english) > 100 or len(german) > 200:
                logger.warning(f"Skipping suspicious entry: {english[:50]}...")
                continue

            if not english or not german:
                continue

            vocab_entry = VocabularyEntry(
                id=str(uuid.uuid4()),
                english=english,
                german=german,
                example_sentence=entry.get("example"),
                word_type=entry.get("word_type"),
            )
            vocabulary.append(vocab_entry)

        return vocabulary

    except Exception as e:
        logger.error(f"Failed to parse vocabulary JSON: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
260
klausur-service/backend/vocab_worksheet_generation.py
Normal file
@@ -0,0 +1,260 @@
"""
Vocabulary Worksheet Generation — HTML/PDF generation and PDF utilities.

Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.

Functions:
- generate_worksheet_html(): Build HTML for various worksheet types
- generate_worksheet_pdf(): Convert HTML to PDF via WeasyPrint
- get_pdf_page_count(): Count pages in a PDF (PyMuPDF)
- convert_pdf_page_to_image(): Render single PDF page to PNG
- convert_pdf_to_images(): Render multiple PDF pages to PNG
"""

import io
import logging
import os
from typing import List, Optional

from fastapi import HTTPException

from vocab_worksheet_models import VocabularyEntry, WorksheetType

logger = logging.getLogger(__name__)

# Optional dependency: WeasyPrint
try:
    from weasyprint import HTML as _WeasyHTML
    WEASYPRINT_AVAILABLE = True
except (ImportError, OSError):
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available")

# Optional dependency: PyMuPDF
try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
except ImportError:
    FITZ_AVAILABLE = False
    logger.warning("PyMuPDF (fitz) not available")


# =============================================================================
# Worksheet HTML Generation
# =============================================================================

def generate_worksheet_html(
    vocabulary: List[VocabularyEntry],
    worksheet_type: WorksheetType,
    title: str,
    show_solutions: bool = False,
    repetitions: int = 3,
    line_height: str = "normal"
) -> str:
    """Generate HTML for a worksheet."""

    # Line height CSS
    line_heights = {
        "normal": "2.5em",
        "large": "3.5em",
        "extra-large": "4.5em"
    }
    lh = line_heights.get(line_height, "2.5em")

    html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
    @page {{ size: A4; margin: 2cm; }}
    body {{ font-family: 'Segoe UI', Arial, sans-serif; font-size: 14px; }}
    h1 {{ font-size: 24px; margin-bottom: 10px; }}
    .meta {{ color: #666; margin-bottom: 20px; }}
    .name-line {{ margin-bottom: 30px; }}
    .vocab-table {{ width: 100%; border-collapse: collapse; }}
    .vocab-table td {{ padding: 8px; border-bottom: 1px solid #ddd; line-height: {lh}; }}
    .vocab-word {{ width: 40%; font-weight: 500; }}
    .vocab-blank {{ width: 60%; border-bottom: 2px dotted #999; }}
    .vocab-answer {{ width: 60%; color: #2563eb; }}
    .gap {{ border-bottom: 2px solid #333; min-width: 100px; display: inline-block; }}
    .hint {{ color: #666; font-style: italic; font-size: 12px; }}
    .section {{ margin-top: 30px; }}
    .section-title {{ font-size: 16px; font-weight: 600; margin-bottom: 15px; color: #374151; }}
</style>
</head>
<body>
<h1>{title}</h1>
<div class="name-line">Name: _________________________ Datum: _____________</div>
"""

    if worksheet_type == WorksheetType.EN_TO_DE:
        html += '<div class="section"><div class="section-title">Uebersetze ins Deutsche:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-answer">{entry.german}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{entry.english}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.DE_TO_EN:
        html += '<div class="section"><div class="section-title">Uebersetze ins Englische:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            if show_solutions:
                html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-answer">{entry.english}</td></tr>'
            else:
                html += f'<tr><td class="vocab-word">{entry.german}</td><td class="vocab-blank"></td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.COPY_PRACTICE:
        html += '<div class="section"><div class="section-title">Schreibe jedes Wort mehrmals:</div>'
        html += '<table class="vocab-table">'
        for entry in vocabulary:
            html += f'<tr><td class="vocab-word">{entry.english}</td>'
            html += '<td class="vocab-blank">'
            if show_solutions:
                html += f' {entry.english} ' * repetitions
            html += '</td></tr>'
        html += '</table></div>'

    elif worksheet_type == WorksheetType.GAP_FILL:
        entries_with_examples = [e for e in vocabulary if e.example_sentence]
        if entries_with_examples:
            html += '<div class="section"><div class="section-title">Fuege das passende Wort ein:</div>'
            for i, entry in enumerate(entries_with_examples, 1):
                # Create gap sentence by removing the English word
                gap_sentence = entry.example_sentence
                for word in entry.english.split():
                    if word.lower() in gap_sentence.lower():
                        gap_sentence = gap_sentence.replace(word, '<span class="gap"></span>')
                        gap_sentence = gap_sentence.replace(word.capitalize(), '<span class="gap"></span>')
                        gap_sentence = gap_sentence.replace(word.lower(), '<span class="gap"></span>')
                        break

                html += f'<p>{i}. {gap_sentence}</p>'
                if show_solutions:
                    html += f'<p class="hint">Loesung: {entry.english}</p>'
                else:
                    html += f'<p class="hint">({entry.german})</p>'
            html += '</div>'

    html += '</body></html>'
    return html
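

# Illustrative call (names as defined in this module; the entry list and
# title are up to the caller):
#   html = generate_worksheet_html(entries, WorksheetType.EN_TO_DE,
#                                  title="Unit 3 Vokabeln", show_solutions=False)
#   pdf_bytes = await generate_worksheet_pdf(html)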


# =============================================================================
# Worksheet PDF Generation
# =============================================================================

async def generate_worksheet_pdf(html: str) -> bytes:
    """Generate PDF from HTML using WeasyPrint."""
    try:
        from weasyprint import HTML
        pdf_bytes = HTML(string=html).write_pdf()
        return pdf_bytes
    except ImportError:
        logger.warning("WeasyPrint not available, returning HTML")
        return html.encode('utf-8')
    except Exception as e:
        logger.error(f"PDF generation failed: {e}")
        raise


# =============================================================================
# PDF Utilities (PyMuPDF)
# =============================================================================

def get_pdf_page_count(pdf_data: bytes) -> int:
    """Get the number of pages in a PDF."""
    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        count = pdf_document.page_count
        pdf_document.close()
        return count
    except Exception as e:
        logger.error(f"Failed to get PDF page count: {e}")
        return 0


async def convert_pdf_page_to_image(pdf_data: bytes, page_number: int = 0, thumbnail: bool = False) -> bytes:
    """Convert a specific page of PDF to PNG image using PyMuPDF.

    Args:
        pdf_data: PDF file as bytes
        page_number: 0-indexed page number
        thumbnail: If True, return a smaller thumbnail image
    """
    try:
        import fitz  # PyMuPDF

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

        if pdf_document.page_count == 0:
            raise ValueError("PDF has no pages")

        if page_number >= pdf_document.page_count:
            raise ValueError(f"Page {page_number} does not exist (PDF has {pdf_document.page_count} pages)")

        page = pdf_document[page_number]

        # Render page to image
        # For thumbnails: lower resolution, for OCR: higher resolution
        zoom = 0.5 if thumbnail else 2.0
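        # fitz renders at 72 dpi per unit zoom, so 2.0 gives roughly 144 dpi
        # (enough for OCR) while 0.5 yields a lightweight ~36 dpi thumbnail.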
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        png_data = pix.tobytes("png")
        pdf_document.close()

        logger.info(f"Converted PDF page {page_number} to PNG: {len(png_data)} bytes (thumbnail={thumbnail})")
        return png_data

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available - PyMuPDF not installed")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")


async def convert_pdf_to_images(pdf_data: bytes, pages: Optional[List[int]] = None) -> List[bytes]:
    """Convert multiple pages of PDF to PNG images.

    Args:
        pdf_data: PDF file as bytes
        pages: List of 0-indexed page numbers to convert. If None, convert all pages.
    """
    try:
        import fitz

        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

        if pdf_document.page_count == 0:
            raise ValueError("PDF has no pages")

        # If no pages specified, convert all
        if pages is None:
            pages = list(range(pdf_document.page_count))

        images = []
        zoom = 2.0
        mat = fitz.Matrix(zoom, zoom)

        for page_num in pages:
            if page_num < pdf_document.page_count:
                page = pdf_document[page_num]
                pix = page.get_pixmap(matrix=mat)
                images.append(pix.tobytes("png"))

        pdf_document.close()
        logger.info(f"Converted {len(images)} PDF pages to images")
        return images

    except ImportError:
        logger.error("PyMuPDF (fitz) not installed")
        raise HTTPException(status_code=500, detail="PDF conversion not available")
    except Exception as e:
        logger.error(f"PDF conversion failed: {e}")
        raise HTTPException(status_code=400, detail=f"PDF conversion failed: {str(e)}")
86
klausur-service/backend/vocab_worksheet_models.py
Normal file
@@ -0,0 +1,86 @@
"""Pydantic models and enums for the Vocab Worksheet API."""

from datetime import datetime
from enum import Enum
from typing import List, Optional

from pydantic import BaseModel


# =============================================================================
# Enums
# =============================================================================

class WorksheetType(str, Enum):
    EN_TO_DE = "en_to_de"      # English -> German translation
    DE_TO_EN = "de_to_en"      # German -> English translation
    COPY_PRACTICE = "copy"     # Write word multiple times
    GAP_FILL = "gap_fill"      # Fill in the blanks
    COMBINED = "combined"      # All types combined


class SessionStatus(str, Enum):
    PENDING = "pending"        # Session created, no upload yet
    PROCESSING = "processing"  # OCR in progress
    EXTRACTED = "extracted"    # Vocabulary extracted, ready to edit
    COMPLETED = "completed"    # Worksheet generated


# =============================================================================
# Pydantic Models
# =============================================================================

class VocabularyEntry(BaseModel):
    id: str
    english: str
    german: str
    example_sentence: Optional[str] = None
    example_sentence_gap: Optional[str] = None  # With ___ for gap-fill
    word_type: Optional[str] = None  # noun, verb, adjective, etc.
    source_page: Optional[int] = None  # Page number where entry was found (1-indexed)


class SessionCreate(BaseModel):
    name: str
    description: Optional[str] = None
    source_language: str = "en"  # Source language (default English)
    target_language: str = "de"  # Target language (default German)


class SessionResponse(BaseModel):
    id: str
    name: str
    description: Optional[str]
    source_language: str
    target_language: str
    status: str
    vocabulary_count: int
    image_path: Optional[str]
    created_at: datetime


class VocabularyResponse(BaseModel):
    session_id: str
    vocabulary: List[VocabularyEntry]
    extraction_confidence: Optional[float]


class VocabularyUpdate(BaseModel):
    vocabulary: List[VocabularyEntry]


class WorksheetGenerateRequest(BaseModel):
    worksheet_types: List[WorksheetType]
    title: Optional[str] = None
    include_solutions: bool = True
    repetitions: int = 3  # For copy practice
    line_height: str = "normal"  # normal, large, extra-large


class WorksheetResponse(BaseModel):
    id: str
    session_id: str
    worksheet_types: List[str]
    pdf_path: str
    solution_path: Optional[str]
    generated_at: datetime
481
klausur-service/backend/vocab_worksheet_ocr.py
Normal file
@@ -0,0 +1,481 @@
"""
Vocab Worksheet OCR Pipeline — full Kombi OCR pipeline for a single page.

Extracted from vocab_worksheet_api.py to keep file sizes manageable.

Pipeline steps:
    orientation → deskew → dewarp → crop → scan-quality → enhance →
    dual-engine OCR (RapidOCR + Tesseract) → merge → grid-build →
    vocab extraction → row merging
"""

import logging
import uuid
from typing import Optional

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Optional heavy dependencies (not available in every environment)
# ---------------------------------------------------------------------------

try:
    import cv2
    import numpy as np
except ImportError:
    cv2 = None  # type: ignore[assignment]
    np = None  # type: ignore[assignment]
    logger.warning("cv2 / numpy not available — OCR pipeline disabled")

try:
    from PIL import Image
except ImportError:
    Image = None  # type: ignore[assignment]

try:
    import pytesseract
except ImportError:
    pytesseract = None  # type: ignore[assignment]

# CV pipeline helpers
try:
    from cv_vocab_pipeline import (
        deskew_two_pass,
        dewarp_image,
        detect_and_fix_orientation,
        _cells_to_vocab_entries,
        _fix_phonetic_brackets,
    )
except ImportError:
    deskew_two_pass = None  # type: ignore[assignment]
    dewarp_image = None  # type: ignore[assignment]
    detect_and_fix_orientation = None  # type: ignore[assignment]
    _cells_to_vocab_entries = None  # type: ignore[assignment]
    _fix_phonetic_brackets = None  # type: ignore[assignment]

try:
    from cv_cell_grid import (
        _merge_wrapped_rows,
        _merge_phonetic_continuation_rows,
        _merge_continuation_rows,
    )
except ImportError:
    _merge_wrapped_rows = None  # type: ignore[assignment]
    _merge_phonetic_continuation_rows = None  # type: ignore[assignment]
    _merge_continuation_rows = None  # type: ignore[assignment]

try:
    from cv_ocr_engines import ocr_region_rapid
except ImportError:
    ocr_region_rapid = None  # type: ignore[assignment]

try:
    from cv_vocab_types import PageRegion
except ImportError:
    PageRegion = None  # type: ignore[assignment]

try:
    from ocr_pipeline_ocr_merge import (
        _split_paddle_multi_words,
        _merge_paddle_tesseract,
        _deduplicate_words,
    )
except ImportError:
    _split_paddle_multi_words = None  # type: ignore[assignment]
    _merge_paddle_tesseract = None  # type: ignore[assignment]
    _deduplicate_words = None  # type: ignore[assignment]

try:
    from cv_words_first import build_grid_from_words
except ImportError:
    build_grid_from_words = None  # type: ignore[assignment]

try:
    from ocr_pipeline_session_store import (
        create_session_db as create_pipeline_session_db,
        update_session_db as update_pipeline_session_db,
    )
except ImportError:
    create_pipeline_session_db = None  # type: ignore[assignment]
    update_pipeline_session_db = None  # type: ignore[assignment]


# ---------------------------------------------------------------------------
# Main pipeline function
# ---------------------------------------------------------------------------

async def _run_ocr_pipeline_for_page(
    img_bgr: "np.ndarray",
    page_number: int,
    vocab_session_id: str,
    *,
    ipa_mode: str = "none",
    syllable_mode: str = "none",
    enable_enhance: bool = True,
    max_columns: Optional[int] = 3,
    override_min_conf: Optional[int] = None,
) -> tuple:
    """Run the full Kombi OCR pipeline on a single page and return vocab entries.

    Uses the same pipeline as the admin OCR Kombi pipeline:
    orientation → deskew → dewarp → crop → dual-engine OCR → grid-build
    (with pipe-autocorrect, word-gap merge, dictionary detection, etc.)

    Args:
        img_bgr: BGR numpy array.
        page_number: 0-indexed page number.
        vocab_session_id: Vocab session ID for logging.
        ipa_mode: "none" (default for worksheets), "auto", "all", "en", "de".
        syllable_mode: "none" (default for worksheets), "auto", "all", "en", "de".

    Returns (entries, rotation_deg, scan_quality_report) where entries is a
    list of dicts, rotation_deg is the orientation correction applied
    (0, 90, 180, 270), and scan_quality_report may be None if assessment failed.
    """
    import time as _time

    t_total = _time.time()
    img_h, img_w = img_bgr.shape[:2]
    logger.info(f"Kombi Pipeline page {page_number + 1}: image {img_w}x{img_h}")

    # 1. Orientation detection (fix upside-down scans)
    t0 = _time.time()
    img_bgr, rotation = detect_and_fix_orientation(img_bgr)
    if rotation:
        img_h, img_w = img_bgr.shape[:2]
        logger.info(f" orientation: rotated {rotation}° ({_time.time() - t0:.1f}s)")
    else:
        logger.info(f" orientation: OK ({_time.time() - t0:.1f}s)")

    # 2. Create pipeline session in DB (visible in admin Kombi UI)
    pipeline_session_id = str(uuid.uuid4())
    try:
        _, png_buf = cv2.imencode(".png", img_bgr)
        original_png = png_buf.tobytes()
        await create_pipeline_session_db(
            pipeline_session_id,
            name=f"vocab-ws-{vocab_session_id[:8]}-p{page_number + 1}",
            filename=f"page_{page_number + 1}.png",
            original_png=original_png,
        )
    except Exception as e:
        logger.warning(f"Could not create pipeline session in DB: {e}")

    # 3. Three-pass deskew
    t0 = _time.time()
    deskewed_bgr, angle_applied, deskew_debug = deskew_two_pass(img_bgr.copy())
    logger.info(f" deskew: angle={angle_applied:.2f} ({_time.time() - t0:.1f}s)")

    # 4. Dewarp
    t0 = _time.time()
    dewarped_bgr, dewarp_info = dewarp_image(deskewed_bgr)
    logger.info(f" dewarp: shear={dewarp_info['shear_degrees']:.3f} ({_time.time() - t0:.1f}s)")

    # 5. Content crop (removes scanner borders, gutter shadows)
    t0 = _time.time()
    try:
        from page_crop import detect_and_crop_page
        cropped_bgr, crop_result = detect_and_crop_page(dewarped_bgr)
        if crop_result.get("crop_applied"):
            dewarped_bgr = cropped_bgr
            logger.info(f" crop: applied ({_time.time() - t0:.1f}s)")
        else:
            logger.info(f" crop: skipped ({_time.time() - t0:.1f}s)")
    except Exception as e:
        logger.warning(f" crop: failed ({e}), continuing with uncropped image")

    # 5b. Scan quality assessment
    scan_quality_report = None
    try:
        from scan_quality import score_scan_quality
        scan_quality_report = score_scan_quality(dewarped_bgr)
    except Exception as e:
        logger.warning(f" scan quality: failed ({e})")

    if override_min_conf:
        min_ocr_conf = override_min_conf
    else:
        min_ocr_conf = scan_quality_report.recommended_min_conf if scan_quality_report else 40
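
    # min_ocr_conf is the Tesseract confidence gate used in step 6 below:
    # words scoring under it are dropped before the dual-engine merge.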

    # 5c. Image enhancement for degraded scans
    is_degraded = scan_quality_report.is_degraded if scan_quality_report else False
    if is_degraded and enable_enhance:
        try:
            from ocr_image_enhance import enhance_for_ocr
            dewarped_bgr = enhance_for_ocr(dewarped_bgr, is_degraded=True)
            logger.info(" enhancement: applied (degraded scan)")
        except Exception as e:
            logger.warning(f" enhancement: failed ({e})")

    # 6. Dual-engine OCR (RapidOCR + Tesseract → merge)
    t0 = _time.time()
    img_h, img_w = dewarped_bgr.shape[:2]

    # RapidOCR (local ONNX)
    try:
        from cv_ocr_engines import ocr_region_rapid
        from cv_vocab_types import PageRegion
        full_region = PageRegion(type="full_page", x=0, y=0, width=img_w, height=img_h)
        rapid_words = ocr_region_rapid(dewarped_bgr, full_region) or []
    except Exception as e:
        logger.warning(f" RapidOCR failed: {e}")
        rapid_words = []

    # Tesseract
    from PIL import Image
    import pytesseract
    pil_img = Image.fromarray(cv2.cvtColor(dewarped_bgr, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(
        pil_img, lang="eng+deu", config="--psm 6 --oem 3",
        output_type=pytesseract.Output.DICT,
    )
    tess_words = []
    for i in range(len(data["text"])):
        text = str(data["text"][i]).strip()
        conf_raw = str(data["conf"][i])
        conf = int(conf_raw) if conf_raw.lstrip("-").isdigit() else -1
        if not text or conf < min_ocr_conf:
            continue
        tess_words.append({
            "text": text,
            "left": data["left"][i], "top": data["top"][i],
            "width": data["width"][i], "height": data["height"][i],
            "conf": conf,
        })

    # Merge dual-engine results
    from ocr_pipeline_ocr_merge import _split_paddle_multi_words, _merge_paddle_tesseract, _deduplicate_words
    from cv_words_first import build_grid_from_words

    rapid_split = _split_paddle_multi_words(rapid_words) if rapid_words else []
    if rapid_split or tess_words:
        merged_words = _merge_paddle_tesseract(rapid_split, tess_words)
        merged_words = _deduplicate_words(merged_words)
    else:
        merged_words = tess_words  # fallback to Tesseract only
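
    # As the helper names suggest: RapidOCR boxes that span several words are
    # split first, then the two word lists are merged and near-duplicate boxes
    # collapsed, so each region keeps a single word candidate.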

    # Build initial grid from merged words
    cells, columns_meta = build_grid_from_words(merged_words, img_w, img_h, max_columns=max_columns)
    for cell in cells:
        cell["ocr_engine"] = "rapid_kombi"

    n_rows = len(set(c["row_index"] for c in cells)) if cells else 0
    n_cols = len(columns_meta)
    logger.info(f" ocr: rapid={len(rapid_words)}, tess={len(tess_words)}, "
                f"merged={len(merged_words)}, cells={len(cells)} ({_time.time() - t0:.1f}s)")

    # 7. Save word_result to pipeline session (needed by _build_grid_core)
    word_result = {
        "cells": cells,
        "grid_shape": {"rows": n_rows, "cols": n_cols, "total_cells": len(cells)},
        "columns_used": columns_meta,
        "layout": "vocab" if {c.get("type") for c in columns_meta} & {"column_en", "column_de"} else "generic",
        "image_width": img_w,
        "image_height": img_h,
        "duration_seconds": 0,
        "ocr_engine": "rapid_kombi",
        "raw_tesseract_words": tess_words,
        "summary": {
            "total_cells": len(cells),
            "non_empty_cells": sum(1 for c in cells if c.get("text")),
        },
    }

    # Save images + word_result to pipeline session for admin visibility
    try:
        _, dsk_buf = cv2.imencode(".png", deskewed_bgr)
        _, dwp_buf = cv2.imencode(".png", dewarped_bgr)
        await update_pipeline_session_db(
            pipeline_session_id,
            deskewed_png=dsk_buf.tobytes(),
            dewarped_png=dwp_buf.tobytes(),
            cropped_png=cv2.imencode(".png", dewarped_bgr)[1].tobytes(),
            word_result=word_result,
            deskew_result={"angle_applied": round(angle_applied, 3)},
            dewarp_result={"shear_degrees": dewarp_info.get("shear_degrees", 0)},
            current_step=8,
        )
    except Exception as e:
        logger.warning(f"Could not update pipeline session: {e}")

    # 8. Run full grid-build (with pipe-autocorrect, word-gap merge, etc.)
    t0 = _time.time()
    try:
        from grid_editor_api import _build_grid_core
        session_data = {
            "word_result": word_result,
        }
        grid_result = await _build_grid_core(
            pipeline_session_id, session_data,
            ipa_mode=ipa_mode, syllable_mode=syllable_mode,
        )
        logger.info(f" grid-build: {grid_result.get('summary', {}).get('total_cells', 0)} cells "
                    f"({_time.time() - t0:.1f}s)")

        # Save grid result to pipeline session
        try:
            await update_pipeline_session_db(
                pipeline_session_id,
                grid_editor_result=grid_result,
                current_step=11,
            )
        except Exception:
            pass

    except Exception as e:
        logger.warning(f" grid-build failed: {e}, falling back to basic grid")
        grid_result = None

    # 9. Extract vocab entries
    # Prefer grid-build result (better column detection, more cells) over
    # the initial build_grid_from_words() which often under-clusters.
    page_vocabulary = []
    extraction_source = "none"

    # A) Try grid-build zones first (best quality: 4-column detection, autocorrect)
    if grid_result and grid_result.get("zones"):
        for zone in grid_result["zones"]:
            zone_cols = zone.get("columns", [])
            zone_cells = zone.get("cells", [])
            if not zone_cols or not zone_cells:
                continue

            # Sort columns by x position to determine roles
            sorted_cols = sorted(zone_cols, key=lambda c: c.get("x_min_px", 0))
            col_idx_to_pos = {}
            for pos, col in enumerate(sorted_cols):
                ci = col.get("col_index", col.get("index", -1))
                col_idx_to_pos[ci] = pos

            # Skip zones with only 1 column (likely headers/boxes)
            if len(sorted_cols) < 2:
                continue

            # Group cells by row
            rows_map: dict = {}
            for cell in zone_cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map:
                    rows_map[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map[ri][ci] = (cell.get("text") or "").strip()

            n_cols = len(sorted_cols)
            for ri in sorted(rows_map.keys()):
                row = rows_map[ri]
                # Collect texts in column-position order
                texts = []
                for col in sorted_cols:
                    ci = col.get("col_index", col.get("index", -1))
                    texts.append(row.get(ci, ""))

                if not any(texts):
                    continue

                # Map by position, skipping narrow first column (page refs/markers)
                # Heuristic: if first column is very narrow (<15% of zone width),
                # it's likely a marker/ref column — skip it for vocab
                first_col_width = sorted_cols[0].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)
                zone_width = max(1, (sorted_cols[-1].get("x_max_px", 0) - sorted_cols[0].get("x_min_px", 0)))
                skip_first = first_col_width / zone_width < 0.15 and n_cols >= 3
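
                # e.g. an 800 px wide zone with a 90 px leading reference column:
                # 90 / 800 = 0.1125 < 0.15, so the first column is dropped here.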
                data_texts = texts[1:] if skip_first else texts

                entry = {
                    "id": str(uuid.uuid4()),
                    "english": data_texts[0] if len(data_texts) > 0 else "",
                    "german": data_texts[1] if len(data_texts) > 1 else "",
                    "example_sentence": " ".join(t for t in data_texts[2:] if t) if len(data_texts) > 2 else "",
                    "source_page": page_number + 1,
                }
                if entry["english"] or entry["german"]:
                    page_vocabulary.append(entry)

        if page_vocabulary:
            extraction_source = f"grid-zones ({len(grid_result['zones'])} zones)"

    # B) Fallback: original cells with column classification
    if not page_vocabulary:
        col_types = {c.get("type") for c in columns_meta}
        is_vocab = bool(col_types & {"column_en", "column_de"})

        if is_vocab:
            entries = _cells_to_vocab_entries(cells, columns_meta)
            entries = _fix_phonetic_brackets(entries, pronunciation="british")
            for entry in entries:
                if not entry.get("english") and not entry.get("german"):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": entry.get("english", ""),
                    "german": entry.get("german", ""),
                    "example_sentence": entry.get("example", ""),
                    "source_page": page_number + 1,
                })
            extraction_source = f"classified ({len(columns_meta)} cols)"
        else:
            # Last resort: all cells by position
            rows_map2: dict = {}
            for cell in cells:
                ri = cell.get("row_index", 0)
                if ri not in rows_map2:
                    rows_map2[ri] = {}
                ci = cell.get("col_index", 0)
                rows_map2[ri][ci] = (cell.get("text") or "").strip()
            all_ci = sorted({ci for r in rows_map2.values() for ci in r.keys()})
            for ri in sorted(rows_map2.keys()):
                row = rows_map2[ri]
                texts = [row.get(ci, "") for ci in all_ci]
                if not any(texts):
                    continue
                page_vocabulary.append({
                    "id": str(uuid.uuid4()),
                    "english": texts[0] if len(texts) > 0 else "",
                    "german": texts[1] if len(texts) > 1 else "",
                    "example_sentence": " ".join(texts[2:]) if len(texts) > 2 else "",
                    "source_page": page_number + 1,
                })
            extraction_source = f"generic ({len(all_ci)} cols)"

    # --- Post-processing: merge cell-wrap continuation rows ---
    if len(page_vocabulary) >= 2:
        try:
            # Convert to internal format (example_sentence → example)
            internal = []
            for v in page_vocabulary:
                internal.append({
                    'row_index': len(internal),
                    'english': v.get('english', ''),
                    'german': v.get('german', ''),
                    'example': v.get('example_sentence', ''),
                })

            n_before = len(internal)
            internal = _merge_wrapped_rows(internal)
            internal = _merge_phonetic_continuation_rows(internal)
            internal = _merge_continuation_rows(internal)

            if len(internal) < n_before:
                # Rebuild page_vocabulary from merged entries
                merged_vocab = []
                for entry in internal:
                    if not entry.get('english') and not entry.get('german'):
                        continue
                    merged_vocab.append({
                        'id': str(uuid.uuid4()),
                        'english': entry.get('english', ''),
                        'german': entry.get('german', ''),
                        'example_sentence': entry.get('example', ''),
                        'source_page': page_number + 1,
                    })
                logger.info(f" row merging: {n_before} → {len(merged_vocab)} entries")
                page_vocabulary = merged_vocab
        except Exception as e:
            logger.warning(f" row merging failed (non-critical): {e}")

    logger.info(f" vocab extraction: {len(page_vocabulary)} entries via {extraction_source}")

    total_duration = _time.time() - t_total
    logger.info(f"Kombi Pipeline page {page_number + 1}: "
                f"{len(page_vocabulary)} vocab entries in {total_duration:.1f}s")

    return page_vocabulary, rotation, scan_quality_report
490
klausur-service/backend/vocab_worksheet_upload_api.py
Normal file
@@ -0,0 +1,490 @@
|
||||
"""
|
||||
Vocab Worksheet Upload API — PDF upload, thumbnails, and page processing.
|
||||
|
||||
Extracted from vocab_worksheet_api.py to keep modules under 500 LOC.
|
||||
|
||||
Routes (no prefix — included into the main /api/v1/vocab router):
|
||||
POST /sessions/{session_id}/upload-pdf-info
|
||||
GET /sessions/{session_id}/pdf-thumbnail/{page_number}
|
||||
GET /sessions/{session_id}/pdf-page-image/{page_number}
|
||||
POST /sessions/{session_id}/process-single-page/{page_number}
|
||||
POST /sessions/{session_id}/process-pages
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, UploadFile, File
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from vocab_worksheet_models import SessionStatus, VocabularyEntry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Local storage path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LOCAL_STORAGE_PATH = os.getenv("VOCAB_STORAGE_PATH", "/app/vocab-worksheets")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional heavy dependencies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
from cv_preprocessing import render_pdf_high_res, detect_and_fix_orientation
|
||||
OCR_PIPELINE_AVAILABLE = True
|
||||
except ImportError:
|
||||
np = None # type: ignore[assignment]
|
||||
OCR_PIPELINE_AVAILABLE = False
|
||||
logger.warning("OCR pipeline imports not available in upload module")
|
||||
|
||||
# Sub-module imports (already split out)
|
||||
from vocab_worksheet_generation import (
|
||||
convert_pdf_page_to_image,
|
||||
convert_pdf_to_images,
|
||||
get_pdf_page_count,
|
||||
)
|
||||
from vocab_worksheet_extraction import extract_vocabulary_from_image
|
||||
|
||||
try:
|
||||
from vocab_worksheet_ocr import _run_ocr_pipeline_for_page
|
||||
except ImportError:
|
||||
_run_ocr_pipeline_for_page = None # type: ignore[assignment]
|
||||
logger.warning("vocab_worksheet_ocr not available — process-single-page disabled")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# In-memory session store (shared with main module)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_sessions():
|
||||
from vocab_worksheet_api import _sessions
|
||||
return _sessions


# ---------------------------------------------------------------------------
# Router (no prefix — will be included into the main vocab router)
# ---------------------------------------------------------------------------

upload_router = APIRouter()
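
# Mounting sketch (an assumption; the actual include call lives in
# vocab_worksheet_api.py and may differ):
#
#     # in vocab_worksheet_api.py
#     router = APIRouter(prefix="/api/v1/vocab")
#     router.include_router(upload_router)
#
# Since upload_router itself has no prefix, its routes resolve directly under
# /api/v1/vocab/sessions/... as listed in the module docstring.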


# =============================================================================
# POST /sessions/{session_id}/upload-pdf-info
# =============================================================================


@upload_router.post("/sessions/{session_id}/upload-pdf-info")
async def upload_pdf_get_info(
    session_id: str,
    file: UploadFile = File(...),
):
    """
    Upload a PDF and get its page count and per-page rotations for preview;
    thumbnails are fetched separately via /pdf-thumbnail.
    Use this before processing so the user can select pages.
    """
    logger.info(f"PDF info request for session {session_id}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]

    # Validate file type
    extension = file.filename.split('.')[-1].lower() if file.filename else ''
    content_type = file.content_type or ''

    if extension != 'pdf' and content_type != 'application/pdf':
        raise HTTPException(status_code=400, detail="Only PDF files are supported by this endpoint")

    content = await file.read()

    # Save PDF temporarily
    session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
    os.makedirs(session_dir, exist_ok=True)
    pdf_path = os.path.join(session_dir, "source.pdf")

    with open(pdf_path, 'wb') as f:
        f.write(content)

    # Get page count
    page_count = get_pdf_page_count(content)

    # Store PDF data in session for later processing
    session["pdf_data"] = content
    session["pdf_path"] = pdf_path
    session["pdf_page_count"] = page_count
    session["status"] = "pdf_uploaded"

    # Detect orientation for each page so thumbnails are shown correctly
    page_rotations: dict = {}
    if OCR_PIPELINE_AVAILABLE:
        for pg in range(page_count):
            try:
                img_bgr = render_pdf_high_res(content, pg, zoom=2.0)
                _, rotation = detect_and_fix_orientation(img_bgr)
                if rotation:
                    page_rotations[pg] = rotation
                    logger.info(f"Page {pg + 1}: orientation {rotation}°")
            except Exception as e:
                logger.warning(f"Orientation detection failed for page {pg + 1}: {e}")
    session["page_rotations"] = page_rotations

    return {
        "session_id": session_id,
        "page_count": page_count,
        "filename": file.filename,
        "page_rotations": page_rotations,
    }
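
# Illustrative upload call (host, port, and session id are placeholders; session
# creation itself happens in the main vocab module):
#
#     curl -X POST "http://localhost:8000/api/v1/vocab/sessions/$SID/upload-pdf-info" \
#          -F "file=@worksheet.pdf"
#     # -> {"page_count": 3, "page_rotations": {"1": 90}, ...}
#
# page_rotations keys are 0-indexed page numbers (serialized as strings in
# JSON); the frontend uses them to show thumbnails the right way up.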


# =============================================================================
# GET /sessions/{session_id}/pdf-thumbnail/{page_number}
# =============================================================================


@upload_router.get("/sessions/{session_id}/pdf-thumbnail/{page_number}")
async def get_pdf_thumbnail(session_id: str, page_number: int, hires: bool = Query(False)):
    """Get a thumbnail image of a specific PDF page.

    Uses fitz for rendering so that page_rotations (from OCR orientation
    detection) are applied consistently.

    Args:
        hires: If True, return the full-resolution image (zoom=2.0) instead of a thumbnail (zoom=0.5).
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    try:
        import fitz
        zoom = 2.0 if hires else 0.5
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
    except Exception as e:
        logger.error(f"PDF thumbnail failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
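
# Note: unlike /pdf-page-image below, this endpoint does not bounds-check
# page_number, so an out-of-range index surfaces as a 500 from the fitz page
# lookup rather than a 400. Illustrative request (placeholders):
#
#     GET /api/v1/vocab/sessions/<id>/pdf-thumbnail/0?hires=true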


# =============================================================================
# GET /sessions/{session_id}/pdf-page-image/{page_number}
# =============================================================================


@upload_router.get("/sessions/{session_id}/pdf-page-image/{page_number}")
async def get_pdf_page_image(session_id: str, page_number: int, zoom: float = Query(2.0, ge=0.5, le=4.0)):
    """PDF page as PNG at arbitrary resolution (for the editor view).

    Args:
        zoom: Zoom factor; fitz renders at 72 DPI × zoom (0.5=36 DPI, 1.0=72 DPI, 2.0=144 DPI, 4.0=288 DPI).
    """
    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)
    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    try:
        import fitz
        pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
        page = pdf_document[page_number]
        # Apply orientation correction detected during OCR processing
        rot = session.get("page_rotations", {}).get(page_number, 0)
        if rot:
            page.set_rotation(rot)
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        png_data = pix.tobytes("png")
        pdf_document.close()
        logger.info(f"PDF page {page_number} rendered at zoom={zoom} rot={rot}: {len(png_data)} bytes")
    except Exception as e:
        logger.error(f"PDF page image failed: {e}")
        raise HTTPException(status_code=500, detail=f"PDF rendering failed: {str(e)}")

    return StreamingResponse(
        io.BytesIO(png_data),
        media_type="image/png",
    )
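
# Client-side sketch for pulling a page into OpenCV for local experiments
# (hypothetical helper, not part of this service; assumes httpx and cv2):
#
#     import cv2, httpx
#     import numpy as np
#     r = httpx.get(f"{API}/sessions/{sid}/pdf-page-image/0", params={"zoom": 2.0})
#     img = cv2.imdecode(np.frombuffer(r.content, np.uint8), cv2.IMREAD_COLOR)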


# =============================================================================
# POST /sessions/{session_id}/process-single-page/{page_number}
# =============================================================================


@upload_router.post("/sessions/{session_id}/process-single-page/{page_number}")
async def process_single_page(
    session_id: str,
    page_number: int,
    ipa_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    syllable_mode: str = Query("none", pattern="^(auto|all|de|en|none)$"),
    enhance: bool = Query(True, description="Step 3: CLAHE + Denoise for degraded scans"),
    max_cols: int = Query(3, description="Step 2: Max column count (0=unlimited)"),
    min_conf: int = Query(0, description="Step 1: Min OCR confidence (0=auto from quality score)"),
):
    """
    Process a SINGLE page of an uploaded PDF using the Kombi OCR pipeline.

    Uses the full Kombi pipeline (orientation -> deskew -> dewarp -> crop ->
    dual-engine OCR -> grid build with autocorrect/merge) for best quality.

    Query params:
        ipa_mode: "none" (default), "auto", "all", "en", "de"
        syllable_mode: "none" (default), "auto", "all", "en", "de"
        enhance: true (default) -- apply CLAHE/denoise for degraded scans
        max_cols: 3 (default) -- max column count (0=unlimited)
        min_conf: 0 (default=auto) -- min OCR confidence (0=from quality score)

    The frontend should call this sequentially for each page.
    Returns the vocabulary for just this one page.
    """
    logger.info(f"Processing SINGLE page {page_number + 1} for session {session_id}")

    if session_id not in _get_sessions():
        raise HTTPException(
            status_code=404,
            detail="Session not in memory. Please create a new session and upload the PDF again.",
        )

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    if page_number < 0 or page_number >= page_count:
        raise HTTPException(status_code=400, detail=f"Invalid page number. PDF has {page_count} pages (0-indexed).")

    # Derive pipeline-level variable names for the quality report
    enable_enhance = enhance
    max_columns = max_cols if max_cols > 0 else None
    override_min_conf = min_conf if min_conf > 0 else None

    # --- OCR pipeline path (same render_pdf_high_res as the admin OCR pipeline) ---
    rotation_deg = 0
    quality_report = None
    min_ocr_conf = 40  # default; overridden by the pipeline when a quality report is available
    if OCR_PIPELINE_AVAILABLE and _run_ocr_pipeline_for_page is not None:
        try:
            img_bgr = render_pdf_high_res(pdf_data, page_number, zoom=3.0)
            page_vocabulary, rotation_deg, quality_report = await _run_ocr_pipeline_for_page(
                img_bgr, page_number, session_id,
                ipa_mode=ipa_mode, syllable_mode=syllable_mode,
                enable_enhance=enable_enhance,
                max_columns=max_columns,
                override_min_conf=override_min_conf,
            )
            # Update min_ocr_conf from the quality report if available
            if quality_report and hasattr(quality_report, 'recommended_min_conf'):
                min_ocr_conf = quality_report.recommended_min_conf
        except Exception as e:
            logger.error(f"OCR pipeline failed for page {page_number + 1}: {e}", exc_info=True)
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": f"OCR pipeline error: {e}",
                "vocabulary": [],
                "vocabulary_count": 0,
            }
    else:
        # Fallback to LLM vision extraction
        logger.warning("OCR pipeline not available, falling back to LLM vision")
        image_data = await convert_pdf_page_to_image(pdf_data, page_number, thumbnail=False)
        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_number + 1}.png",
            page_number=page_number
        )
        if error:
            logger.warning(f"Page {page_number + 1} failed: {error}")
            return {
                "session_id": session_id,
                "page_number": page_number + 1,
                "success": False,
                "error": error,
                "vocabulary": [],
                "vocabulary_count": 0,
            }
        page_vocabulary = []
        for entry in vocabulary:
            entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
            entry_dict['source_page'] = page_number + 1
            if 'id' not in entry_dict or not entry_dict['id']:
                entry_dict['id'] = str(uuid.uuid4())
            page_vocabulary.append(entry_dict)

    logger.info(f"Page {page_number + 1}: {len(page_vocabulary)} vocab entries extracted")

    # Store rotation for this page (used by the image/thumbnail endpoints)
    session.setdefault("page_rotations", {})[page_number] = rotation_deg

    # Add to the session's vocabulary (append, don't replace)
    existing_vocab = session.get("vocabulary", [])
    # Remove any existing entries from this page (in case of re-processing)
    existing_vocab = [v for v in existing_vocab if v.get("source_page") != page_number + 1]
    existing_vocab.extend(page_vocabulary)
    session["vocabulary"] = existing_vocab
    session["vocabulary_count"] = len(existing_vocab)
    session["status"] = SessionStatus.EXTRACTED.value

    result = {
        "session_id": session_id,
        "page_number": page_number + 1,
        "success": True,
        "vocabulary": page_vocabulary,
        "vocabulary_count": len(page_vocabulary),
        "total_vocabulary_count": len(existing_vocab),
        "extraction_confidence": 0.9,
        "rotation": rotation_deg,
    }

    # Add scan quality report + active-steps info
    if quality_report:
        sq = quality_report.to_dict()
        sq["active_steps"] = {
            "step1_confidence": f"min_conf={min_ocr_conf}" if not override_min_conf else f"min_conf={override_min_conf} (override)",
            "step2_max_columns": f"max_cols={max_columns}" if max_columns else "unlimited",
            "step3_enhance": "on" if enable_enhance and quality_report.is_degraded else "off",
        }
        result["scan_quality"] = sq

    return result
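
# Sequential processing sketch (hypothetical client loop, mirroring what the
# docstring above asks the frontend to do; API, sid, selected_pages, and
# results are placeholders):
#
#     async with httpx.AsyncClient(base_url=API) as client:
#         for page in selected_pages:  # 0-indexed
#             r = await client.post(
#                 f"/sessions/{sid}/process-single-page/{page}",
#                 params={"ipa_mode": "auto", "enhance": True},
#                 timeout=300.0,  # OCR can take minutes per page
#             )
#             results.append(r.json())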


# =============================================================================
# POST /sessions/{session_id}/process-pages (DEPRECATED)
# =============================================================================


@upload_router.post("/sessions/{session_id}/process-pages")
async def process_pdf_pages(
    session_id: str,
    pages: Optional[List[int]] = None,
    process_all: bool = False,
):
    """
    Process specific pages of an uploaded PDF.

    DEPRECATED: Use /process-single-page/{page_number} instead for better results.

    Args:
        pages: List of 0-indexed page numbers to process
        process_all: If True, process all pages
    """
    logger.info(f"Process pages request for session {session_id}: pages={pages}, process_all={process_all}")

    if session_id not in _get_sessions():
        raise HTTPException(status_code=404, detail="Session not found")

    session = _get_sessions()[session_id]
    pdf_data = session.get("pdf_data")

    if not pdf_data:
        raise HTTPException(status_code=400, detail="No PDF uploaded for this session")

    page_count = session.get("pdf_page_count", 1)

    # Determine which pages to process
    if process_all:
        pages = list(range(page_count))
    elif pages is None or len(pages) == 0:
        pages = [0]  # Default to the first page

    # Convert selected pages to images
    images = await convert_pdf_to_images(pdf_data, pages)

    # Extract vocabulary from each page SEQUENTIALLY
    all_vocabulary = []
    total_confidence = 0.0
    successful_pages = []
    failed_pages = []
    error_messages = []

    for i, image_data in enumerate(images):
        page_num = pages[i]
        logger.info(f"Extracting vocabulary from page {page_num + 1} ({i + 1}/{len(images)})...")

        vocabulary, confidence, error = await extract_vocabulary_from_image(
            image_data,
            f"page_{page_num + 1}.png",
            page_number=page_num
        )

        if error:
            failed_pages.append(page_num + 1)
            error_messages.append(error)
            logger.warning(f"Page {page_num + 1} failed: {error}")
        else:
            successful_pages.append(page_num + 1)
            total_confidence += confidence

            # Add page info to each entry and convert to dict
            for entry in vocabulary:
                entry_dict = entry.dict() if hasattr(entry, 'dict') else (entry.__dict__.copy() if hasattr(entry, '__dict__') else dict(entry))
                entry_dict['source_page'] = page_num + 1
                all_vocabulary.append(entry_dict)

            logger.info(f"Page {page_num + 1}: {len(vocabulary)} vocab entries extracted")

    avg_confidence = total_confidence / len(successful_pages) if successful_pages else 0

    # Update session
    session["vocabulary"] = all_vocabulary
    session["vocabulary_count"] = len(all_vocabulary)
    session["extraction_confidence"] = avg_confidence
    session["processed_pages"] = pages
    session["successful_pages"] = successful_pages
    session["failed_pages"] = failed_pages
    session["status"] = SessionStatus.EXTRACTED.value

    # Save the first page as a preview image
    if images:
        session_dir = os.path.join(LOCAL_STORAGE_PATH, session_id)
        image_path = os.path.join(session_dir, "source.png")
        with open(image_path, 'wb') as f:
            f.write(images[0])
        session["image_path"] = image_path

    result = {
        "session_id": session_id,
        "pages_processed": len(pages),
        "pages_successful": len(successful_pages),
        "pages_failed": len(failed_pages),
        "successful_pages": successful_pages,
        "failed_pages": failed_pages,
        "vocabulary_count": len(all_vocabulary),
        "extraction_confidence": avg_confidence,
        "status": SessionStatus.EXTRACTED.value,
    }

    if error_messages:
        result["errors"] = error_messages

    return result