Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
330 lines
13 KiB
Python
330 lines
13 KiB
Python
"""
|
||
Row grid regularization for document layout analysis.
|
||
|
||
Provides word-center-based row boundary refinement to improve
|
||
gap-based row detection. Extracted from cv_layout_rows.py.
|
||
|
||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||
"""
|
||
|
||
import logging
|
||
from typing import Dict, List
|
||
|
||
import numpy as np
|
||
|
||
from cv_vocab_types import RowGeometry
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A: Group all content words into line clusters by Y-proximity.
            Tolerance = 40% of median gap-based row height.
    Step B: For each cluster compute:
            - center_y = median of (word_top + word_height/2) for all words
            - letter_h = median of word heights (excluding outliers > 2× median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
            (spurious splits from OCR jitter).
    Step C: Compute pitches (distances between consecutive centers).
            Detect section breaks where gap > 1.8× median pitch.
    Step D: Split clusters into sections at the section breaks.
    Step E: Within each section, place row boundaries at midpoints between
            consecutive line centers:
            - First row top = center - local_pitch/2
            - Last row bottom = center + local_pitch/2
            - Interior boundaries = (center_i + center_{i+1}) / 2
            This ensures rows tile seamlessly without gaps or overlaps.
    Step F: Re-assign words to the nearest grid row by vertical center distance.
    Step G: Validate that >= 85% of words land in a grid row; otherwise
            fall back to the original gap-based rows.
    Step H: Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.

    Args:
        rows: Gap-based row geometries; only rows with row_type == 'content'
            are regularized, all others are passed through unchanged.
        word_dicts: NOTE(review): not referenced in this function body —
            presumably kept for signature compatibility with the caller.
        left_x: Absolute left edge used as x for every rebuilt row.
        right_x: NOTE(review): not referenced in this function body.
        top_y: Absolute top of the content ROI; added to cluster centers to
            convert ROI-relative Y to absolute Y (word 'top' values are
            assumed ROI-relative — confirm against the caller).
        content_w: Width assigned to every rebuilt row.
        content_h: Height of the content ROI; row bottoms are clamped to
            top_y + content_h.
        inv: NOTE(review): not referenced in this function body.

    Returns:
        A new, re-indexed list of RowGeometry (non-content rows + rebuilt
        content rows) on success, or the original `rows` list unchanged when
        any guard or the Step-G validation fails.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    # Guard: too few content rows — gap-based detection is good enough.
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox,
    # since the same word dict may appear in more than one row).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Compute median word height (excluding outliers like tall brackets/IPA)
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Compute median gap-based row height — this is the actual line height
    # as detected by the horizontal projection. We use 40% of this as
    # grouping tolerance. This is much more reliable than using word height
    # alone, because words on the same line can have very different heights
    # (e.g. lowercase vs uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Words on the same line should have
    # centers within this range. Even if a word's bbox is taller/shorter,
    # its center should stay within half a row height of the line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    # Anchor center of the current cluster; note it is NOT re-computed as
    # words join — each word is compared to the cluster's first word.
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            # Close the cluster: order its words left-to-right, start a new one.
            current_line.sort(key=lambda w: w['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda w: w['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y per cluster ---
    # center_y = median of (word_top + word_height/2) across all words in cluster
    # letter_h = median of word heights, but excluding outlier-height words
    # (>2× median) so that tall brackets/IPA don't skew the height
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        centers = [w['top'] + w['height'] / 2 for w in cl_words]
        # Filter outlier heights for letter_h computation
        normal_heights = [w['height'] for w in cl_words
                          if w['height'] <= median_wh * 2.0]
        if not normal_heights:
            # All words were "outliers" — fall back to using every height.
            normal_heights = [w['height'] for w in cl_words]
        center_y = float(np.median(centers))
        letter_h = float(np.median(normal_heights))
        cluster_info.append({
            'center_y_rel': center_y,           # relative to content ROI
            'center_y_abs': center_y + top_y,   # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, some edge cases can produce
    # spurious clusters. Merge any pair whose centers are closer
    # than 30% of the row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center
            combined_words = prev['words'] + cl['words']
            centers = [w['top'] + w['height'] / 2 for w in combined_words]
            normal_heights = [w['height'] for w in combined_words
                              if w['height'] <= median_wh * 2.0]
            if not normal_heights:
                normal_heights = [w['height'] for w in combined_words]
            prev['center_y_rel'] = float(np.median(centers))
            prev['center_y_abs'] = prev['center_y_rel'] + top_y
            prev['letter_h'] = float(np.median(normal_heights))
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitch = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        pitches.append(pitch)

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    # Degenerate pitch (lines essentially on top of each other) — bail out.
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (groups of consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading)
            cl = section[0]
            # Half-height is at least the letter height (generous padding),
            # but never thinner than 40% of the global pitch.
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            row_top = cl['center_y_abs'] - half_h
            row_bot = cl['center_y_abs'] + half_h
            grid_rows.append(RowGeometry(
                index=0,            # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Compute local pitch for this section
        local_pitches = []
        for i in range(1, len(section)):
            local_pitches.append(
                section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
            )
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers.
        # First row: top = center - local_pitch/2
        # Last row: bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and previous center
                prev_center = section[i - 1]['center_y_abs']
                row_top = (prev_center + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                next_center = section[i + 1]['center_y_abs']
                row_bot = (cl['center_y_abs'] + next_center) / 2

            # Clamp to reasonable bounds
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            # Skip degenerate (clamped-away or near-zero-height) rows.
            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,            # re-indexed in Step H
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly; assign each word to the row whose
    # center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        # top_y converts the ROI-relative word center to absolute Y, matching
        # the absolute gr.y coordinates of the grid rows.
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            row_center = gr.y + gr.height / 2
            dist = abs(w_center - row_center)
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Reject matches farther than one full pitch — such a word belongs
        # to no grid row and counts against the Step-G match ratio.
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    words_placed = sum(gr.word_count for gr in grid_rows)
    if len(content_words) > 0:
        match_ratio = words_placed / len(content_words)
        if match_ratio < 0.85:
            logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                        f"of words, keeping gap-based rows")
            return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
|