Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
329
klausur-service/backend/cv_layout_row_regularize.py
Normal file
329
klausur-service/backend/cv_layout_row_regularize.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""
|
||||
Row grid regularization for document layout analysis.
|
||||
|
||||
Provides word-center-based row boundary refinement to improve
|
||||
gap-based row detection. Extracted from cv_layout_rows.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cluster_word_stats(cluster_words: List[Dict], median_wh: float) -> tuple:
    """Compute (center_y, letter_h) for one line cluster of word dicts.

    center_y is the median vertical center (top + height/2) of the words.
    letter_h is the median word height, excluding outlier words taller than
    2x the page-wide median word height (tall brackets, IPA symbols) so
    they do not skew the line-height estimate.
    """
    centers = [w['top'] + w['height'] / 2 for w in cluster_words]
    normal_heights = [w['height'] for w in cluster_words
                      if w['height'] <= median_wh * 2.0]
    if not normal_heights:
        # Every word counted as an outlier — fall back to all heights.
        normal_heights = [w['height'] for w in cluster_words]
    return float(np.median(centers)), float(np.median(normal_heights))


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A: Group all content words into line clusters by Y-proximity.
            Tolerance = 40% of median gap-based row height.
    Step B: For each cluster compute:
            - center_y = median of (word_top + word_height/2) for all words
            - letter_h = median of word heights (excluding outliers > 2x median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
            (spurious splits from OCR jitter).
    Step C: Compute pitches (distances between consecutive centers).
            Detect section breaks where gap > 1.8x median pitch.
    Step D: Split clusters into sections at the section breaks.
    Step E: Within each section, place row boundaries at midpoints between
            consecutive line centers:
            - First row top = center - local_pitch/2
            - Last row bottom = center + local_pitch/2
            - Interior boundaries = (center_i + center_{i+1}) / 2
            This ensures rows tile seamlessly without gaps or overlaps.
    Step F: Re-assign words to the nearest grid row by vertical center distance.
    Step G: Validate that >= 85% of words land in a grid row; otherwise
            fall back to the original gap-based rows.
    Step H: Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.

    NOTE(review): ``word_dicts``, ``right_x`` and ``inv`` are not used in
    this function; they appear to be kept for signature compatibility with
    the caller in cv_layout_rows.py — confirm before removing them.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Median word height (used to exclude outliers like tall brackets/IPA).
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Median gap-based row height — the actual line height as detected by
    # the horizontal projection. We use 40% of this as grouping tolerance,
    # which is much more reliable than word height alone because words on
    # the same line can have very different bbox heights (lowercase vs
    # uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Even if a word's bbox is taller or
    # shorter, its center should stay within half a row height of the
    # line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity.
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda ww: ww['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda ww: ww['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y / letter_h per cluster ---
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        center_y, letter_h = _cluster_word_stats(cl_words, median_wh)
        cluster_info.append({
            'center_y_rel': center_y,          # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, edge cases can produce spurious
    # clusters. Merge any pair whose centers are closer than 30% of the
    # row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center and letter height.
            combined_words = prev['words'] + cl['words']
            center_y, letter_h = _cluster_word_stats(combined_words, median_wh)
            prev['center_y_rel'] = center_y
            prev['center_y_abs'] = center_y + top_y
            prev['letter_h'] = letter_h
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitches.append(cluster_info[i]['center_y_rel']
                       - cluster_info[i - 1]['center_y_rel'])

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        # Degenerate spacing — trust the gap-based rows.
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.).
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading).
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            # FIX: clamp to the content area and reject degenerate rows,
            # exactly like the multi-line path below — previously a heading
            # near the content edge could yield an out-of-bounds row.
            row_top = max(top_y, cl['center_y_abs'] - half_h)
            row_bot = min(top_y + content_h, cl['center_y_abs'] + half_h)
            if row_bot - row_top < 5:
                continue
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Local pitch for this section.
        local_pitches = [section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
                         for i in range(1, len(section))]
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers:
        # First row: top = center - local_pitch/2
        # Last row:  bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and the previous center.
                row_top = (section[i - 1]['center_y_abs'] + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                row_bot = (cl['center_y_abs'] + section[i + 1]['center_y_abs']) / 2

            # Clamp to the content area.
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly vs the new grid; assign each word to
    # the row whose center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            dist = abs(w_center - (gr.y + gr.height / 2))
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Only accept matches within one pitch — farther words stay unplaced
        # and count against the validation ratio below.
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    # content_words is guaranteed non-empty by the earlier >= 5 guard.
    words_placed = sum(gr.word_count for gr in grid_rows)
    match_ratio = words_placed / len(content_words)
    if match_ratio < 0.85:
        logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                    f"of words, keeping gap-based rows")
        return rows

    # Remove empty grid rows (no words assigned).
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
|
||||
Reference in New Issue
Block a user