feat(ocr-pipeline): add row detection step with horizontal gap analysis
Add Step 4 (row detection) between column detection and word recognition. Uses horizontal projection profiles + whitespace gaps (same method as columns). Includes header/footer classification via gap-size heuristics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,20 @@ class ColumnGeometry:
|
|||||||
width_ratio: float # width / content_width (0.0-1.0)
|
width_ratio: float # width / content_width (0.0-1.0)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RowGeometry:
    """A geometrically detected text row, tagged as content, header or footer."""

    # Position of the row in the page, 0-based, counted from top to bottom.
    index: int
    # Absolute left edge in px (the content area's left_x).
    x: int
    # Absolute y coordinate in px where the row starts.
    y: int
    # Width of the content area in px.
    width: int
    # Row height in px.
    height: int
    # Number of entries in `words`.
    word_count: int
    # Word dicts assigned to this row.
    words: List[Dict]
    # One of 'content' | 'header' | 'footer'.
    row_type: str = 'content'
    # Height in px of the whitespace gap above this row.
    gap_before: int = 0
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class VocabRow:
|
class VocabRow:
|
||||||
"""A single vocabulary entry assembled from multi-column OCR."""
|
"""A single vocabulary entry assembled from multi-column OCR."""
|
||||||
@@ -885,7 +899,8 @@ def _detect_columns_by_clustering(
|
|||||||
right_x: int,
|
right_x: int,
|
||||||
top_y: int,
|
top_y: int,
|
||||||
bottom_y: int,
|
bottom_y: int,
|
||||||
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
|
inv: Optional[np.ndarray] = None,
|
||||||
|
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]]:
|
||||||
"""Fallback: detect columns by clustering left-aligned word positions.
|
"""Fallback: detect columns by clustering left-aligned word positions.
|
||||||
|
|
||||||
Used when the primary gap-based algorithm finds fewer than 2 gaps.
|
Used when the primary gap-based algorithm finds fewer than 2 gaps.
|
||||||
@@ -965,7 +980,7 @@ def _detect_columns_by_clustering(
|
|||||||
margin_px = max(6, int(content_w * 0.003))
|
margin_px = max(6, int(content_w * 0.003))
|
||||||
return _build_geometries_from_starts(
|
return _build_geometries_from_starts(
|
||||||
[(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
|
[(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
|
||||||
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
|
word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h, inv,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -978,7 +993,8 @@ def _build_geometries_from_starts(
|
|||||||
bottom_y: int,
|
bottom_y: int,
|
||||||
content_w: int,
|
content_w: int,
|
||||||
content_h: int,
|
content_h: int,
|
||||||
) -> Tuple[List[ColumnGeometry], int, int, int, int]:
|
inv: Optional[np.ndarray] = None,
|
||||||
|
) -> Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], Optional[np.ndarray]]:
|
||||||
"""Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
|
"""Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs."""
|
||||||
geometries = []
|
geometries = []
|
||||||
for i, (start_x, count) in enumerate(col_starts):
|
for i, (start_x, count) in enumerate(col_starts):
|
||||||
@@ -1005,10 +1021,10 @@ def _build_geometries_from_starts(
|
|||||||
|
|
||||||
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
|
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
|
||||||
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
||||||
return (geometries, left_x, right_x, top_y, bottom_y)
|
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||||
|
|
||||||
|
|
||||||
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
|
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int, List[Dict], np.ndarray]]:
|
||||||
"""Detect column geometry using whitespace-gap analysis with word validation.
|
"""Detect column geometry using whitespace-gap analysis with word validation.
|
||||||
|
|
||||||
Phase A of the two-phase column detection. Uses vertical projection
|
Phase A of the two-phase column detection. Uses vertical projection
|
||||||
@@ -1022,8 +1038,8 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
|||||||
dewarped_bgr: Original BGR image (for Tesseract word detection).
|
dewarped_bgr: Original BGR image (for Tesseract word detection).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if
|
Tuple of (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||||
detection fails entirely.
|
or None if detection fails entirely.
|
||||||
"""
|
"""
|
||||||
h, w = ocr_img.shape[:2]
|
h, w = ocr_img.shape[:2]
|
||||||
|
|
||||||
@@ -1165,7 +1181,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
|||||||
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
|
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
|
||||||
return _detect_columns_by_clustering(
|
return _detect_columns_by_clustering(
|
||||||
word_dicts, left_edges, edge_word_indices,
|
word_dicts, left_edges, edge_word_indices,
|
||||||
content_w, content_h, left_x, right_x, top_y, bottom_y,
|
content_w, content_h, left_x, right_x, top_y, bottom_y, inv,
|
||||||
)
|
)
|
||||||
|
|
||||||
# --- Step 7: Derive column boundaries from gaps ---
|
# --- Step 7: Derive column boundaries from gaps ---
|
||||||
@@ -1261,7 +1277,270 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
|||||||
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
|
logger.info(f"ColumnGeometry: {len(geometries)} columns: "
|
||||||
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
|
||||||
|
|
||||||
return (geometries, left_x, right_x, top_y, bottom_y)
|
return (geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Row Geometry Detection (horizontal whitespace-gap analysis)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def detect_row_geometry(
    inv: np.ndarray,
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int, bottom_y: int,
) -> List['RowGeometry']:
    """Detect text rows via horizontal whitespace-gap analysis.

    Builds a horizontal projection profile of the content area, finds
    low-density bands (gaps) between text lines, validates those gaps
    against the word bounding boxes and turns the spans between gaps into
    RowGeometry objects. Unusually large gaps near the top/bottom of the
    content area mark the header/footer boundary.

    Args:
        inv: Inverted binarized full-page image. When it is None the
            function falls back to pure word grouping instead of failing.
        word_dicts: Word boxes with 'top' and 'height' keys, relative to
            the content area.
        left_x, right_x: Absolute X bounds of the content area.
        top_y, bottom_y: Absolute Y bounds of the content area.

    Returns:
        RowGeometry objects sorted top to bottom. Empty when the content
        area is smaller than 10 px in either dimension.
    """
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    if content_h < 10 or content_w < 10:
        logger.warning("detect_row_geometry: content area too small")
        return []

    if inv is None:
        # Column detection declares its returned image as Optional, so a
        # missing image must not crash the slicing below.
        logger.warning("detect_row_geometry: no binarized image, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    # --- Step 1: Horizontal projection profile (ink density per pixel row) ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    h_proj = np.sum(content_strip, axis=1).astype(float)
    h_proj_norm = h_proj / (content_w * 255)

    # --- Step 2: Smoothing + threshold ---
    kernel_size = max(3, content_h // 200)
    if kernel_size % 2 == 0:
        kernel_size += 1  # keep the box filter symmetric around each row
    h_smooth = np.convolve(h_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    inked = h_smooth[h_smooth > 0]
    median_density = float(np.median(inked)) if inked.size else 0.01
    gap_threshold = max(median_density * 0.15, 0.003)

    in_gap = h_smooth < gap_threshold
    MIN_GAP_HEIGHT = max(3, content_h // 500)

    # --- Step 3: Collect contiguous gap regions ---
    raw_gaps = _collect_row_gap_runs(in_gap, MIN_GAP_HEIGHT)
    logger.info(f"RowGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
                f"min_height={MIN_GAP_HEIGHT}px)")

    # --- Step 4: Validate gaps against word bounding boxes ---
    validated_gaps = _validate_row_gaps(raw_gaps, word_dicts, MIN_GAP_HEIGHT)
    logger.info(f"RowGeometry: {len(validated_gaps)} gaps after word validation")

    # --- Fallback if too few gaps ---
    if len(validated_gaps) < 2:
        logger.info("RowGeometry: < 2 gaps found, falling back to word grouping")
        return _build_rows_from_word_grouping(
            word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
        )

    validated_gaps.sort(key=lambda g: g[0])

    # --- Step 5: Header/footer detection via gap size ---
    HEADER_FOOTER_ZONE = 0.15
    GAP_MULTIPLIER = 2.0

    median_gap = float(np.median([ge - gs for gs, ge in validated_gaps]))
    large_gap_threshold = median_gap * GAP_MULTIPLIER

    header_zone_limit = int(content_h * HEADER_FOOTER_ZONE)
    footer_zone_start = int(content_h * (1.0 - HEADER_FOOTER_ZONE))

    header_boundary_rel = None  # rows whose midpoint lies above this are header
    footer_boundary_rel = None  # rows whose midpoint lies below this are footer

    best_header_gap = _largest_row_gap(
        (gs, ge) for gs, ge in validated_gaps
        if (gs + ge) / 2 < header_zone_limit and ge - gs > large_gap_threshold
    )
    if best_header_gap is not None:
        header_boundary_rel = best_header_gap[1]
        logger.info(f"RowGeometry: header boundary at y_rel={header_boundary_rel} "
                    f"(gap={best_header_gap[1] - best_header_gap[0]}px, "
                    f"median_gap={median_gap:.0f}px)")

    best_footer_gap = _largest_row_gap(
        (gs, ge) for gs, ge in validated_gaps
        if (gs + ge) / 2 > footer_zone_start and ge - gs > large_gap_threshold
    )
    if best_footer_gap is not None:
        footer_boundary_rel = best_footer_gap[0]
        logger.info(f"RowGeometry: footer boundary at y_rel={footer_boundary_rel} "
                    f"(gap={best_footer_gap[1] - best_footer_gap[0]}px)")

    # --- Step 6: Build RowGeometry objects from gaps ---
    # Rows are the spans between gaps.
    row_boundaries = []  # (start_y_rel, end_y_rel)

    first_gap_start = validated_gaps[0][0]
    last_gap_end = validated_gaps[-1][1]

    # Top of content to first gap
    if first_gap_start > MIN_GAP_HEIGHT:
        row_boundaries.append((0, first_gap_start))

    # Between consecutive gaps
    for (_, prev_end), (next_start, _) in zip(validated_gaps, validated_gaps[1:]):
        if next_start - prev_end > 0:
            row_boundaries.append((prev_end, next_start))

    # Last gap to bottom of content
    if last_gap_end < content_h - MIN_GAP_HEIGHT:
        row_boundaries.append((last_gap_end, content_h))

    # Size of the gap that ends exactly where a row starts. A row starting
    # at the content top has no such gap and therefore gets gap_before=0.
    gap_size_by_end = {ge: ge - gs for gs, ge in validated_gaps}

    rows = []
    for idx, (row_start_rel, row_end_rel) in enumerate(row_boundaries):
        row_mid = (row_start_rel + row_end_rel) / 2
        if header_boundary_rel is not None and row_mid < header_boundary_rel:
            row_type = 'header'
        elif footer_boundary_rel is not None and row_mid > footer_boundary_rel:
            row_type = 'footer'
        else:
            row_type = 'content'

        # A word belongs to the row that contains its vertical centre.
        row_words = [w for w in word_dicts
                     if row_start_rel <= w['top'] + w['height'] / 2 < row_end_rel]

        rows.append(RowGeometry(
            index=idx,
            x=left_x,
            y=top_y + row_start_rel,
            width=content_w,
            height=row_end_rel - row_start_rel,
            word_count=len(row_words),
            words=row_words,
            row_type=row_type,
            gap_before=gap_size_by_end.get(row_start_rel, 0),
        ))

    type_counts = {}
    for r in rows:
        type_counts[r.row_type] = type_counts.get(r.row_type, 0) + 1
    logger.info(f"RowGeometry: {len(rows)} rows detected: {type_counts}")

    return rows


def _collect_row_gap_runs(in_gap, min_height):
    """Return (start, end) of every run of True in in_gap that is at least min_height long."""
    # Padding with False on both sides makes runs touching either end
    # produce a rising and a falling edge like any other run.
    padded = np.concatenate(([False], np.asarray(in_gap, dtype=bool), [False])).astype(np.int8)
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)  # exclusive end index
    # Plain ints keep the values JSON-serializable further down the pipeline.
    return [(int(s), int(e)) for s, e in zip(starts, ends) if e - s >= min_height]


def _validate_row_gaps(raw_gaps, word_dicts, min_height):
    """Keep gaps free of words; shrink a gap past overlapping words when enough room remains."""
    word_spans = [(wd['top'], wd['top'] + wd['height']) for wd in word_dicts]

    validated = []
    for gap_start_rel, gap_end_rel in raw_gaps:
        hits = [(wt, wb) for wt, wb in word_spans
                if wt < gap_end_rel and wb > gap_start_rel]
        if not hits:
            validated.append((gap_start_rel, gap_end_rel))
            continue

        min_word_top = min(wt for wt, _ in hits)
        max_word_bottom = max(wb for _, wb in hits)

        if min_word_top - gap_start_rel >= min_height:
            # Enough clean space above the overlapping words.
            validated.append((gap_start_rel, min_word_top))
        elif gap_end_rel - max_word_bottom >= min_height:
            # Enough clean space below the overlapping words.
            validated.append((max_word_bottom, gap_end_rel))
        else:
            logger.debug(f"RowGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
                         f"discarded (word overlap, no room to shift)")
    return validated


def _largest_row_gap(candidates):
    """Return the tallest (start, end) gap among candidates (first one on ties), or None."""
    return max(candidates, key=lambda g: g[1] - g[0], default=None)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_rows_from_word_grouping(
|
||||||
|
word_dicts: List[Dict],
|
||||||
|
left_x: int, right_x: int,
|
||||||
|
top_y: int, bottom_y: int,
|
||||||
|
content_w: int, content_h: int,
|
||||||
|
) -> List['RowGeometry']:
|
||||||
|
"""Fallback: build rows by grouping words by Y position.
|
||||||
|
|
||||||
|
Uses _group_words_into_lines() with a generous tolerance.
|
||||||
|
No header/footer detection in fallback mode.
|
||||||
|
"""
|
||||||
|
if not word_dicts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
y_tolerance = max(20, content_h // 100)
|
||||||
|
lines = _group_words_into_lines(word_dicts, y_tolerance_px=y_tolerance)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for idx, line_words in enumerate(lines):
|
||||||
|
if not line_words:
|
||||||
|
continue
|
||||||
|
min_top = min(w['top'] for w in line_words)
|
||||||
|
max_bottom = max(w['top'] + w['height'] for w in line_words)
|
||||||
|
row_height = max_bottom - min_top
|
||||||
|
|
||||||
|
rows.append(RowGeometry(
|
||||||
|
index=idx,
|
||||||
|
x=left_x,
|
||||||
|
y=top_y + min_top,
|
||||||
|
width=content_w,
|
||||||
|
height=row_height,
|
||||||
|
word_count=len(line_words),
|
||||||
|
words=line_words,
|
||||||
|
row_type='content',
|
||||||
|
gap_before=0,
|
||||||
|
))
|
||||||
|
|
||||||
|
logger.info(f"RowGeometry (fallback): {len(rows)} rows from word grouping")
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
# --- Phase B: Content-Based Classification ---
|
# --- Phase B: Content-Based Classification ---
|
||||||
@@ -1861,7 +2140,7 @@ def analyze_layout_by_words(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Li
|
|||||||
layout_img = create_layout_image(dewarped_bgr)
|
layout_img = create_layout_image(dewarped_bgr)
|
||||||
return analyze_layout(layout_img, ocr_img)
|
return analyze_layout(layout_img, ocr_img)
|
||||||
|
|
||||||
geometries, left_x, right_x, top_y, bottom_y = result
|
geometries, left_x, right_x, top_y, bottom_y, _word_dicts, _inv = result
|
||||||
content_w = right_x - left_x
|
content_w = right_x - left_x
|
||||||
|
|
||||||
# Phase B: Content-based classification
|
# Phase B: Content-based classification
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
-- Migration 003: Add row_result column for row geometry detection
|
||||||
|
-- Stores detected row geometries including header/footer classification
|
||||||
|
|
||||||
|
ALTER TABLE ocr_pipeline_sessions ADD COLUMN IF NOT EXISTS row_result JSONB;
|
||||||
@@ -1,14 +1,15 @@
|
|||||||
"""
|
"""
|
||||||
OCR Pipeline API - Schrittweise Seitenrekonstruktion.
|
OCR Pipeline API - Schrittweise Seitenrekonstruktion.
|
||||||
|
|
||||||
Zerlegt den OCR-Prozess in 7 einzelne Schritte:
|
Zerlegt den OCR-Prozess in 8 einzelne Schritte:
|
||||||
1. Deskewing - Scan begradigen
|
1. Deskewing - Scan begradigen
|
||||||
2. Dewarping - Buchwoelbung entzerren
|
2. Dewarping - Buchwoelbung entzerren
|
||||||
3. Spaltenerkennung - Unsichtbare Spalten finden
|
3. Spaltenerkennung - Unsichtbare Spalten finden
|
||||||
4. Worterkennung - OCR mit Bounding Boxes
|
4. Zeilenerkennung - Horizontale Zeilen + Kopf-/Fusszeilen
|
||||||
5. Koordinatenzuweisung - Exakte Positionen
|
5. Worterkennung - OCR mit Bounding Boxes
|
||||||
6. Seitenrekonstruktion - Seite nachbauen
|
6. Koordinatenzuweisung - Exakte Positionen
|
||||||
7. Ground Truth Validierung - Gesamtpruefung
|
7. Seitenrekonstruktion - Seite nachbauen
|
||||||
|
8. Ground Truth Validierung - Gesamtpruefung
|
||||||
|
|
||||||
Lizenz: Apache 2.0
|
Lizenz: Apache 2.0
|
||||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||||
@@ -30,9 +31,13 @@ from pydantic import BaseModel
|
|||||||
from cv_vocab_pipeline import (
|
from cv_vocab_pipeline import (
|
||||||
analyze_layout,
|
analyze_layout,
|
||||||
analyze_layout_by_words,
|
analyze_layout_by_words,
|
||||||
|
classify_column_types,
|
||||||
|
create_layout_image,
|
||||||
create_ocr_image,
|
create_ocr_image,
|
||||||
deskew_image,
|
deskew_image,
|
||||||
deskew_image_by_word_alignment,
|
deskew_image_by_word_alignment,
|
||||||
|
detect_column_geometry,
|
||||||
|
detect_row_geometry,
|
||||||
dewarp_image,
|
dewarp_image,
|
||||||
dewarp_image_manual,
|
dewarp_image_manual,
|
||||||
render_image_high_res,
|
render_image_high_res,
|
||||||
@@ -139,6 +144,16 @@ class ColumnGroundTruthRequest(BaseModel):
|
|||||||
notes: Optional[str] = None
|
notes: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ManualRowsRequest(BaseModel):
    """Request body for replacing the detected rows with manual definitions."""

    # One free-form dict per row; stored as-is in the session's row_result.
    rows: List[Dict[str, Any]]
|
||||||
|
|
||||||
|
|
||||||
|
class RowGroundTruthRequest(BaseModel):
    """Request body carrying reviewer feedback for the row detection step."""

    # Whether the reviewer accepts the detected rows.
    is_correct: bool
    # Optional replacement rows supplied by the reviewer.
    corrected_rows: Optional[List[Dict[str, Any]]] = None
    # Optional free-text remarks.
    notes: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Session Management Endpoints
|
# Session Management Endpoints
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -275,14 +290,17 @@ async def delete_session(session_id: str):
|
|||||||
|
|
||||||
@router.get("/sessions/{session_id}/image/{image_type}")
|
@router.get("/sessions/{session_id}/image/{image_type}")
|
||||||
async def get_image(session_id: str, image_type: str):
|
async def get_image(session_id: str, image_type: str):
|
||||||
"""Serve session images: original, deskewed, dewarped, binarized, or columns-overlay."""
|
"""Serve session images: original, deskewed, dewarped, binarized, columns-overlay, or rows-overlay."""
|
||||||
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay"}
|
valid_types = {"original", "deskewed", "dewarped", "binarized", "columns-overlay", "rows-overlay"}
|
||||||
if image_type not in valid_types:
|
if image_type not in valid_types:
|
||||||
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
|
raise HTTPException(status_code=400, detail=f"Unknown image type: {image_type}")
|
||||||
|
|
||||||
if image_type == "columns-overlay":
|
if image_type == "columns-overlay":
|
||||||
return await _get_columns_overlay(session_id)
|
return await _get_columns_overlay(session_id)
|
||||||
|
|
||||||
|
if image_type == "rows-overlay":
|
||||||
|
return await _get_rows_overlay(session_id)
|
||||||
|
|
||||||
# Try cache first for fast serving
|
# Try cache first for fast serving
|
||||||
cached = _cache.get(session_id)
|
cached = _cache.get(session_id)
|
||||||
if cached:
|
if cached:
|
||||||
@@ -643,9 +661,27 @@ async def detect_columns(session_id: str):
|
|||||||
|
|
||||||
# Binarized image for layout analysis
|
# Binarized image for layout analysis
|
||||||
ocr_img = create_ocr_image(dewarped_bgr)
|
ocr_img = create_ocr_image(dewarped_bgr)
|
||||||
|
h, w = ocr_img.shape[:2]
|
||||||
|
|
||||||
|
# Phase A: Geometry detection (returns word_dicts + inv for reuse)
|
||||||
|
geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
|
||||||
|
|
||||||
|
if geo_result is None:
|
||||||
|
# Fallback to projection-based layout
|
||||||
|
layout_img = create_layout_image(dewarped_bgr)
|
||||||
|
regions = analyze_layout(layout_img, ocr_img)
|
||||||
|
else:
|
||||||
|
geometries, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
|
||||||
|
content_w = right_x - left_x
|
||||||
|
|
||||||
|
# Cache intermediates for row detection (avoids second Tesseract run)
|
||||||
|
cached["_word_dicts"] = word_dicts
|
||||||
|
cached["_inv"] = inv
|
||||||
|
cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)
|
||||||
|
|
||||||
|
# Phase B: Content-based classification
|
||||||
|
regions = classify_column_types(geometries, content_w, top_y, w, h, bottom_y)
|
||||||
|
|
||||||
# Word-based detection (with automatic fallback to projection profiles)
|
|
||||||
regions = analyze_layout_by_words(ocr_img, dewarped_bgr)
|
|
||||||
duration = time.time() - t0
|
duration = time.time() - t0
|
||||||
|
|
||||||
columns = [asdict(r) for r in regions]
|
columns = [asdict(r) for r in regions]
|
||||||
@@ -807,3 +843,209 @@ async def _get_columns_overlay(session_id: str) -> Response:
|
|||||||
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
|
raise HTTPException(status_code=500, detail="Failed to encode overlay image")
|
||||||
|
|
||||||
return Response(content=result_png.tobytes(), media_type="image/png")
|
return Response(content=result_png.tobytes(), media_type="image/png")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Row Detection Endpoints
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/rows")
async def detect_rows(session_id: str):
    """Run row detection on the dewarped image using horizontal gap analysis.

    Reuses the word boxes, inverted image and content bounds cached under
    "_word_dicts", "_inv" and "_content_bounds"; when any of them is
    missing, column geometry detection is run to produce them. The result
    is persisted as row_result (current_step=4), mirrored into the session
    cache and returned together with the session id.

    Raises:
        HTTPException 400: dewarped image missing, or column geometry
            detection failed.
    """
    if session_id not in _cache:
        await _load_session_to_cache(session_id)
    cached = _get_cached(session_id)

    dewarped_bgr = cached.get("dewarped_bgr")
    if dewarped_bgr is None:
        raise HTTPException(status_code=400, detail="Dewarp must be completed before row detection")

    started = time.time()

    # Intermediates from column detection, if that step already ran.
    word_dicts = cached.get("_word_dicts")
    inv = cached.get("_inv")
    content_bounds = cached.get("_content_bounds")

    have_intermediates = not (word_dicts is None or inv is None or content_bounds is None)
    if have_intermediates:
        left_x, right_x, top_y, bottom_y = content_bounds
    else:
        # Nothing cached: run column geometry just to obtain the intermediates.
        ocr_img = create_ocr_image(dewarped_bgr)
        geo_result = detect_column_geometry(ocr_img, dewarped_bgr)
        if geo_result is None:
            raise HTTPException(status_code=400, detail="Column geometry detection failed — cannot detect rows")
        _geoms, left_x, right_x, top_y, bottom_y, word_dicts, inv = geo_result
        cached["_word_dicts"] = word_dicts
        cached["_inv"] = inv
        cached["_content_bounds"] = (left_x, right_x, top_y, bottom_y)

    rows = detect_row_geometry(inv, word_dicts, left_x, right_x, top_y, bottom_y)
    duration = time.time() - started

    # Serializable view of each row; the word lists are left out to keep
    # the payload small.
    rows_data = [
        {
            "index": row.index,
            "x": row.x,
            "y": row.y,
            "width": row.width,
            "height": row.height,
            "word_count": row.word_count,
            "row_type": row.row_type,
            "gap_before": row.gap_before,
        }
        for row in rows
    ]

    # Number of rows per row_type, in order of first appearance.
    type_counts = {}
    for row in rows:
        type_counts[row.row_type] = type_counts.get(row.row_type, 0) + 1

    row_result = {
        "rows": rows_data,
        "summary": type_counts,
        "total_rows": len(rows),
        "duration_seconds": round(duration, 2),
    }

    # Persist to DB, then mirror into the in-memory cache.
    await update_session_db(
        session_id,
        row_result=row_result,
        current_step=4,
    )
    cached["row_result"] = row_result

    logger.info(f"OCR Pipeline: rows session {session_id}: "
                f"{len(rows)} rows detected ({duration:.2f}s): {type_counts}")

    return {"session_id": session_id, **row_result}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/rows/manual")
async def set_manual_rows(session_id: str, req: ManualRowsRequest):
    """Override detected rows with manual definitions.

    Stores the submitted rows as the session's row_result (marked with
    method="manual"), mirrors the result into the session cache when the
    session is cached, and returns it together with the session id.

    Raises:
        HTTPException 404: the session does not exist.
    """
    # Same existence check as the ground-truth handlers, so an unknown id
    # is reported instead of being answered with a success payload.
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    row_result = {
        "rows": req.rows,
        "total_rows": len(req.rows),
        "duration_seconds": 0,
        "method": "manual",
    }

    await update_session_db(session_id, row_result=row_result)

    if session_id in _cache:
        _cache[session_id]["row_result"] = row_result

    logger.info(f"OCR Pipeline: manual rows session {session_id}: "
                f"{len(req.rows)} rows set")

    return {"session_id": session_id, **row_result}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sessions/{session_id}/ground-truth/rows")
async def save_row_ground_truth(session_id: str, req: RowGroundTruthRequest):
    """Save ground truth feedback for the row detection step.

    The feedback is stored under the "rows" key of the session's
    ground_truth, together with a timestamp and a snapshot of the
    session's current row_result.

    Raises:
        HTTPException 404: the session does not exist.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    # Feedback entry for this step, including the row_result it refers to.
    gt = {
        "is_correct": req.is_correct,
        "corrected_rows": req.corrected_rows,
        "notes": req.notes,
        "saved_at": datetime.utcnow().isoformat(),
        "row_result": session.get("row_result"),
    }

    # Entries stored under other ground_truth keys are kept untouched.
    ground_truth = session.get("ground_truth") or {}
    ground_truth["rows"] = gt

    await update_session_db(session_id, ground_truth=ground_truth)

    if session_id in _cache:
        _cache[session_id]["ground_truth"] = ground_truth

    return {"session_id": session_id, "ground_truth": gt}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sessions/{session_id}/ground-truth/rows")
async def get_row_ground_truth(session_id: str):
    """Retrieve saved ground truth for row detection.

    Returns the stored feedback ("rows_gt") next to the session's current
    row_result ("rows_auto").

    Raises:
        HTTPException 404: the session does not exist, or no row ground
            truth has been saved for it.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    rows_gt = (session.get("ground_truth") or {}).get("rows")
    if not rows_gt:
        raise HTTPException(status_code=404, detail="No row ground truth saved")

    response = {
        "session_id": session_id,
        "rows_gt": rows_gt,
        "rows_auto": session.get("row_result"),
    }
    return response
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_rows_overlay(session_id: str) -> Response:
    """Generate the dewarped image with the detected row bands drawn on it.

    Each row gets a solid border, a text label ("R<index> <TYPE>", plus the
    word count when non-zero) and a fill blended in at 15% opacity.

    Rows can come from the manual-rows endpoint as free-form JSON, so
    coordinates are coerced to int and rows without usable geometry are
    skipped with a warning instead of failing the whole request.

    Raises:
        HTTPException 404: session, row data or dewarped image missing.
        HTTPException 500: the image cannot be decoded or encoded.
    """
    session = await get_session_db(session_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    row_result = session.get("row_result")
    if not row_result or not row_result.get("rows"):
        raise HTTPException(status_code=404, detail="No row data available")

    # Load dewarped image
    dewarped_png = await get_session_image(session_id, "dewarped")
    if not dewarped_png:
        raise HTTPException(status_code=404, detail="Dewarped image not available")

    arr = np.frombuffer(dewarped_png, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=500, detail="Failed to decode image")

    # Color map for row types (BGR)
    row_colors = {
        "content": (255, 180, 0),   # Blue
        "header": (128, 128, 128),  # Gray
        "footer": (128, 128, 128),  # Gray
    }

    overlay = img.copy()
    for row in row_result["rows"]:
        try:
            # OpenCV drawing calls need integer pixel coordinates; JSON
            # numbers may arrive as floats.
            x, y, w, h = (int(round(float(row[key])))
                          for key in ("x", "y", "width", "height"))
        except (KeyError, TypeError, ValueError):
            logger.warning(f"OCR Pipeline: rows overlay session {session_id}: "
                           f"skipping row without usable geometry: {row!r}")
            continue

        # A missing or null row_type is drawn as a content row.
        row_type = str(row.get("row_type") or "content")
        color = row_colors.get(row_type, (200, 200, 200))

        # Fill goes onto the copy so it can be blended in afterwards.
        cv2.rectangle(overlay, (x, y), (x + w, y + h), color, -1)

        # Solid border
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)

        # Label
        idx = row.get("index", 0)
        label = f"R{idx} {row_type.upper()}"
        wc = row.get("word_count", 0)
        if wc:
            label = f"{label} ({wc}w)"
        cv2.putText(img, label, (x + 5, y + 18),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

    # Blend overlay at 15% opacity
    cv2.addWeighted(overlay, 0.15, img, 0.85, 0, img)

    success, result_png = cv2.imencode(".png", img)
    if not success:
        raise HTTPException(status_code=500, detail="Failed to encode overlay image")

    return Response(content=result_png.tobytes(), media_type="image/png")
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ async def create_session_db(
|
|||||||
id, name, filename, original_png, status, current_step
|
id, name, filename, original_png, status, current_step
|
||||||
) VALUES ($1, $2, $3, $4, 'active', 1)
|
) VALUES ($1, $2, $3, $4, 'active', 1)
|
||||||
RETURNING id, name, filename, status, current_step,
|
RETURNING id, name, filename, status, current_step,
|
||||||
deskew_result, dewarp_result, column_result,
|
deskew_result, dewarp_result, column_result, row_result,
|
||||||
ground_truth, auto_shear_degrees,
|
ground_truth, auto_shear_degrees,
|
||||||
created_at, updated_at
|
created_at, updated_at
|
||||||
""", uuid.UUID(session_id), name, filename, original_png)
|
""", uuid.UUID(session_id), name, filename, original_png)
|
||||||
@@ -93,7 +93,7 @@ async def get_session_db(session_id: str) -> Optional[Dict[str, Any]]:
|
|||||||
async with pool.acquire() as conn:
|
async with pool.acquire() as conn:
|
||||||
row = await conn.fetchrow("""
|
row = await conn.fetchrow("""
|
||||||
SELECT id, name, filename, status, current_step,
|
SELECT id, name, filename, status, current_step,
|
||||||
deskew_result, dewarp_result, column_result,
|
deskew_result, dewarp_result, column_result, row_result,
|
||||||
ground_truth, auto_shear_degrees,
|
ground_truth, auto_shear_degrees,
|
||||||
created_at, updated_at
|
created_at, updated_at
|
||||||
FROM ocr_pipeline_sessions WHERE id = $1
|
FROM ocr_pipeline_sessions WHERE id = $1
|
||||||
@@ -135,11 +135,11 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
|||||||
allowed_fields = {
|
allowed_fields = {
|
||||||
'name', 'filename', 'status', 'current_step',
|
'name', 'filename', 'status', 'current_step',
|
||||||
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
|
'original_png', 'deskewed_png', 'binarized_png', 'dewarped_png',
|
||||||
'deskew_result', 'dewarp_result', 'column_result',
|
'deskew_result', 'dewarp_result', 'column_result', 'row_result',
|
||||||
'ground_truth', 'auto_shear_degrees',
|
'ground_truth', 'auto_shear_degrees',
|
||||||
}
|
}
|
||||||
|
|
||||||
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'ground_truth'}
|
jsonb_fields = {'deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth'}
|
||||||
|
|
||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
if key in allowed_fields:
|
if key in allowed_fields:
|
||||||
@@ -163,7 +163,7 @@ async def update_session_db(session_id: str, **kwargs) -> Optional[Dict[str, Any
|
|||||||
SET {', '.join(fields)}
|
SET {', '.join(fields)}
|
||||||
WHERE id = ${param_idx}
|
WHERE id = ${param_idx}
|
||||||
RETURNING id, name, filename, status, current_step,
|
RETURNING id, name, filename, status, current_step,
|
||||||
deskew_result, dewarp_result, column_result,
|
deskew_result, dewarp_result, column_result, row_result,
|
||||||
ground_truth, auto_shear_degrees,
|
ground_truth, auto_shear_degrees,
|
||||||
created_at, updated_at
|
created_at, updated_at
|
||||||
""", *values)
|
""", *values)
|
||||||
@@ -220,7 +220,7 @@ def _row_to_dict(row: asyncpg.Record) -> Dict[str, Any]:
|
|||||||
result[key] = result[key].isoformat()
|
result[key] = result[key].isoformat()
|
||||||
|
|
||||||
# JSONB → parsed (asyncpg returns str for JSONB)
|
# JSONB → parsed (asyncpg returns str for JSONB)
|
||||||
for key in ['deskew_result', 'dewarp_result', 'column_result', 'ground_truth']:
|
for key in ['deskew_result', 'dewarp_result', 'column_result', 'row_result', 'ground_truth']:
|
||||||
if key in result and result[key] is not None:
|
if key in result and result[key] is not None:
|
||||||
if isinstance(result[key], str):
|
if isinstance(result[key], str):
|
||||||
result[key] = json.loads(result[key])
|
result[key] = json.loads(result[key])
|
||||||
|
|||||||
Reference in New Issue
Block a user