Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,329 @@
"""
Row grid regularization for document layout analysis.
Provides word-center-based row boundary refinement to improve
gap-based row detection. Extracted from cv_layout_rows.py.
License: Apache 2.0 (commercial use permitted)
PRIVACY: All processing is performed locally.
"""
import logging
from typing import Dict, List
import numpy as np
from cv_vocab_types import RowGeometry
logger = logging.getLogger(__name__)
def _word_center_y(word: Dict) -> float:
    """Return the vertical center of a word box, in the same frame as ``word['top']``."""
    return word['top'] + word['height'] / 2


def _summarize_line_cluster(words: List[Dict], median_wh: float, top_y: int) -> Dict:
    """Build the per-line record (median center, typical letter height) for *words*.

    ``letter_h`` ignores words taller than 2x the global median word height so
    that tall glyphs (brackets, IPA) do not skew it; if every word is that
    tall, all heights are used instead.
    """
    centers = [_word_center_y(w) for w in words]
    normal_heights = [w['height'] for w in words
                      if w['height'] <= median_wh * 2.0]
    if not normal_heights:
        normal_heights = [w['height'] for w in words]
    center_y = float(np.median(centers))
    return {
        'center_y_rel': center_y,          # same frame as the word boxes
        'center_y_abs': center_y + top_y,  # shifted by top_y (row frame)
        'letter_h': float(np.median(normal_heights)),
        'words': words,
    }


def _make_grid_row(row_top: float, row_bot: float, words: List[Dict],
                   left_x: int, content_w: int) -> 'RowGeometry':
    """Create a content row spanning ``[row_top, row_bot]`` on integer pixels.

    Both edges are rounded individually and the height is their difference,
    so two rows that share a (fractional) boundary also share the rounded
    boundary exactly. Rounding ``y`` and ``height`` independently would
    produce 1px gaps/overlaps for boundaries ending in ``.5``.
    """
    y_top = round(row_top)
    y_bot = round(row_bot)
    return RowGeometry(
        index=0,  # final indices are assigned by the caller after merging
        x=left_x,
        y=y_top,
        width=content_w,
        height=y_bot - y_top,
        word_count=len(words),
        words=words,
        row_type='content',
        gap_before=0,
    )


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A:  Group all content words into line clusters by Y-proximity.
             Tolerance = 40% of median gap-based row height (at least 10px).
    Step B:  For each cluster compute:
             - center_y = median of (word_top + word_height/2) for all words
             - letter_h = median of word heights (excluding outliers > 2x median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
             (at least 8px) -- spurious splits from OCR jitter.
    Step C:  Compute pitches (distances between consecutive centers).
             Detect section breaks where gap > 1.8x median pitch.
    Step D:  Split clusters into sections at the section breaks.
    Step E:  Within each section, place row boundaries at midpoints between
             consecutive line centers:
             - First row top = center - local_pitch/2
             - Last row bottom = center + local_pitch/2
             - Interior boundaries = (center_i + center_{i+1}) / 2
             Boundaries are rounded once per edge so rows inside a section
             tile without gaps or overlaps. All rows, including single-line
             sections, are clamped to [top_y, top_y + content_h].
    Step F:  Re-assign words to the nearest grid row by vertical center
             distance; each row's words are ordered left-to-right.
    Step G:  Validate that >= 85% of words land in a grid row; otherwise
             fall back to the original gap-based rows.
    Step H:  Merge with preserved header/footer rows and re-index.

    Guard: requires >= 5 content rows (and >= 5 distinct content words, and
    >= 3 line clusters) to activate; otherwise ``rows`` is returned as-is.

    Args:
        rows: Gap-based rows. Each row is read for ``row_type``, ``words``,
            ``height`` and ``y``; rows whose ``row_type`` is not ``'content'``
            are passed through and only get a new ``index``.
        word_dicts: Not referenced by this function (kept for the call
            signature).
        left_x: X coordinate written to every rebuilt row.
        right_x: Not referenced by this function.
        top_y: Offset added to word ``top`` values to obtain row coordinates.
        content_w: Width written to every rebuilt row.
        content_h: Height of the content area; rows are clamped to
            ``top_y + content_h``.
        inv: Not referenced by this function.

    Returns:
        The original ``rows`` object when a guard or the validation fails,
        otherwise a new list of non-content rows plus rebuilt content rows,
        sorted by ``y`` and re-indexed from 0.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']
    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows, de-duplicated by bounding box.
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)
    if len(content_words) < 5:
        return rows

    # Median word height (upper median); used to spot outlier-tall words.
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Median gap-based row height. The grouping tolerance is derived from
    # this rather than from word heights, because words on the same line can
    # have very different heights (lowercase vs uppercase, brackets, ...).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center, then sweep: a word joins the current line while its
    # center stays within y_tol of the line's FIRST word center.
    words_by_center = sorted(content_words,
                             key=lambda w: (_word_center_y(w), w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = _word_center_y(words_by_center[0])
    for w in words_by_center[1:]:
        w_center = _word_center_y(w)
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda cw: cw['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center
    current_line.sort(key=lambda cw: cw['left'])
    line_clusters.append(current_line)
    if len(line_clusters) < 3:
        return rows

    # --- Step B: Per-cluster center and letter height ---
    cluster_info: List[Dict] = [
        _summarize_line_cluster(cl_words, median_wh, top_y)
        for cl_words in line_clusters
    ]
    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Centers closer than 30% of the row height are treated as one text line.
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Recompute the statistics over the union of both word lists.
            merged[-1] = _summarize_line_cluster(
                prev['words'] + cl['words'], median_wh, top_y)
        else:
            merged.append(cl)
    cluster_info = merged
    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    # At least 3 clusters remain here, so there are always >= 2 pitches.
    pitches: List[float] = [
        b['center_y_rel'] - a['center_y_rel']
        for a, b in zip(cluster_info, cluster_info[1:])
    ]
    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.)
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = [[cluster_info[0]]]
    for pitch, cl in zip(pitches, cluster_info[1:]):
        if pitch > median_pitch * BREAK_FACTOR:
            sections.append([cl])
        else:
            sections[-1].append(cl)

    # --- Step E: Build row boundaries per section ---
    lower_bound = top_y
    upper_bound = top_y + content_h
    grid_rows: List['RowGeometry'] = []
    for section in sections:
        if len(section) == 1:
            # Single-line section (likely a heading): no neighbours to take
            # midpoints from, so size it from its own letter height.
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            # Clamp like every other row so headings near the edges cannot
            # extend outside the content area.
            row_top = max(lower_bound, cl['center_y_abs'] - half_h)
            row_bot = min(upper_bound, cl['center_y_abs'] + half_h)
            if row_bot > row_top:
                grid_rows.append(
                    _make_grid_row(row_top, row_bot, cl['words'],
                                   left_x, content_w))
            continue

        # Local pitch of this section (it has >= 2 lines here).
        local_pitch = float(np.median([
            b['center_y_rel'] - a['center_y_rel']
            for a, b in zip(section, section[1:])
        ]))

        last = len(section) - 1
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                row_top = (section[i - 1]['center_y_abs'] + cl['center_y_abs']) / 2
            if i == last:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                row_bot = (cl['center_y_abs'] + section[i + 1]['center_y_abs']) / 2

            # Clamp to the content area; drop rows that become degenerate.
            row_top = max(lower_bound, row_top)
            row_bot = min(upper_bound, row_bot)
            if row_bot - row_top < 5:
                continue
            grid_rows.append(
                _make_grid_row(row_top, row_bot, cl['words'],
                               left_x, content_w))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Each word goes to the row whose center is closest to the word's
    # vertical center, provided it is closer than one median pitch.
    for gr in grid_rows:
        gr.words = []
    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        # min() keeps the first row on ties, like a strict '<' scan would.
        best_row = min(grid_rows,
                       key=lambda gr: abs(w_center - (gr.y + gr.height / 2)))
        best_dist = abs(w_center - (best_row.y + best_row.height / 2))
        if best_dist < median_pitch:
            best_row.words.append(w)
    for gr in grid_rows:
        # Restore the left-to-right order the line clusters had in Step A;
        # re-assignment appends in source-row order, which may interleave.
        gr.words.sort(key=lambda rw: rw['left'])
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    # content_words has >= 5 entries here, so the ratio is well defined.
    words_placed = sum(gr.word_count for gr in grid_rows)
    match_ratio = words_placed / len(content_words)
    if match_ratio < 0.85:
        logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                    f"of words, keeping gap-based rows")
        return rows

    # Remove empty grid rows (no words assigned)
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights, default=0)
    max_h = max(row_heights, default=0)
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")
    return result