Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
329
klausur-service/backend/cv_layout_row_regularize.py
Normal file
329
klausur-service/backend/cv_layout_row_regularize.py
Normal file
@@ -0,0 +1,329 @@
|
||||
"""
|
||||
Row grid regularization for document layout analysis.
|
||||
|
||||
Provides word-center-based row boundary refinement to improve
|
||||
gap-based row detection. Extracted from cv_layout_rows.py.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import RowGeometry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cluster_word_stats(cluster_words: List[Dict], median_wh: float) -> tuple:
    """Compute (center_y, letter_h) for one line cluster of word dicts.

    center_y is the median vertical center (top + height/2) of the words.
    letter_h is the median word height, excluding outlier words taller than
    2x the page-wide median word height (tall brackets, IPA symbols) so
    they do not skew the line-height estimate.
    """
    centers = [w['top'] + w['height'] / 2 for w in cluster_words]
    normal_heights = [w['height'] for w in cluster_words
                      if w['height'] <= median_wh * 2.0]
    if not normal_heights:
        # Every word counted as an outlier — fall back to all heights.
        normal_heights = [w['height'] for w in cluster_words]
    return float(np.median(centers)), float(np.median(normal_heights))


def _regularize_row_grid(
    rows: List['RowGeometry'],
    word_dicts: List[Dict],
    left_x: int, right_x: int,
    top_y: int,
    content_w: int, content_h: int,
    inv: np.ndarray,
) -> List['RowGeometry']:
    """Rebuild row boundaries from word center-lines with section-break awareness.

    Instead of overlaying a rigid grid, this derives row positions bottom-up
    from the words themselves:

    Step A: Group all content words into line clusters by Y-proximity.
            Tolerance = 40% of median gap-based row height.
    Step B: For each cluster compute:
            - center_y = median of (word_top + word_height/2) for all words
            - letter_h = median of word heights (excluding outliers > 2x median)
    Step B2: Merge clusters whose centers are closer than 30% of row height
            (spurious splits from OCR jitter).
    Step C: Compute pitches (distances between consecutive centers).
            Detect section breaks where gap > 1.8x median pitch.
    Step D: Split clusters into sections at the section breaks.
    Step E: Within each section, place row boundaries at midpoints between
            consecutive line centers:
            - First row top = center - local_pitch/2
            - Last row bottom = center + local_pitch/2
            - Interior boundaries = (center_i + center_{i+1}) / 2
            This ensures rows tile seamlessly without gaps or overlaps.
    Step F: Re-assign words to the nearest grid row by vertical center distance.
    Step G: Validate that >= 85% of words land in a grid row; otherwise
            fall back to the original gap-based rows.
    Step H: Merge with preserved header/footer rows and re-index.

    Guard: Requires >= 5 content rows from gap-based detection to activate.
    This prevents the regularizer from running on very small images (e.g.
    box sub-sessions with only 3-6 rows) where the gap-based detection
    is already accurate enough.

    Header/footer rows from the gap-based detection are preserved.

    NOTE(review): ``word_dicts``, ``right_x`` and ``inv`` are not used in
    this function; they appear to be kept for signature compatibility with
    the caller in cv_layout_rows.py — confirm before removing them.
    """
    content_rows = [r for r in rows if r.row_type == 'content']
    non_content = [r for r in rows if r.row_type != 'content']

    if len(content_rows) < 5:
        return rows

    # --- Step A: Group ALL words into line clusters ---
    # Collect words that belong to content rows (deduplicated by bbox).
    content_words: List[Dict] = []
    seen_keys: set = set()
    for r in content_rows:
        for w in r.words:
            key = (w['left'], w['top'], w['width'], w['height'])
            if key not in seen_keys:
                seen_keys.add(key)
                content_words.append(w)

    if len(content_words) < 5:
        return rows

    # Median word height (used to exclude outliers like tall brackets/IPA).
    word_heights = sorted(w['height'] for w in content_words)
    median_wh = word_heights[len(word_heights) // 2]

    # Median gap-based row height — the actual line height as detected by
    # the horizontal projection. We use 40% of this as grouping tolerance,
    # which is much more reliable than word height alone because words on
    # the same line can have very different bbox heights (lowercase vs
    # uppercase, brackets, phonetic symbols).
    gap_row_heights = sorted(r.height for r in content_rows)
    median_row_h = gap_row_heights[len(gap_row_heights) // 2]

    # Tolerance: 40% of row height. Even if a word's bbox is taller or
    # shorter, its center should stay within half a row height of the
    # line center.
    y_tol = max(10, int(median_row_h * 0.4))

    # Sort by center_y, then group by proximity.
    words_by_center = sorted(content_words,
                             key=lambda w: (w['top'] + w['height'] / 2, w['left']))
    line_clusters: List[List[Dict]] = []
    current_line: List[Dict] = [words_by_center[0]]
    current_center = words_by_center[0]['top'] + words_by_center[0]['height'] / 2

    for w in words_by_center[1:]:
        w_center = w['top'] + w['height'] / 2
        if abs(w_center - current_center) <= y_tol:
            current_line.append(w)
        else:
            current_line.sort(key=lambda ww: ww['left'])
            line_clusters.append(current_line)
            current_line = [w]
            current_center = w_center

    if current_line:
        current_line.sort(key=lambda ww: ww['left'])
        line_clusters.append(current_line)

    if len(line_clusters) < 3:
        return rows

    # --- Step B: Compute center_y / letter_h per cluster ---
    cluster_info: List[Dict] = []
    for cl_words in line_clusters:
        center_y, letter_h = _cluster_word_stats(cl_words, median_wh)
        cluster_info.append({
            'center_y_rel': center_y,          # relative to content ROI
            'center_y_abs': center_y + top_y,  # absolute
            'letter_h': letter_h,
            'words': cl_words,
        })

    cluster_info.sort(key=lambda c: c['center_y_rel'])

    # --- Step B2: Merge clusters that are too close together ---
    # Even with center-based grouping, edge cases can produce spurious
    # clusters. Merge any pair whose centers are closer than 30% of the
    # row height (they're definitely the same text line).
    merge_threshold = max(8, median_row_h * 0.3)
    merged: List[Dict] = [cluster_info[0]]
    for cl in cluster_info[1:]:
        prev = merged[-1]
        if cl['center_y_rel'] - prev['center_y_rel'] < merge_threshold:
            # Merge: combine words, recompute center and letter height.
            combined_words = prev['words'] + cl['words']
            center_y, letter_h = _cluster_word_stats(combined_words, median_wh)
            prev['center_y_rel'] = center_y
            prev['center_y_abs'] = center_y + top_y
            prev['letter_h'] = letter_h
            prev['words'] = combined_words
        else:
            merged.append(cl)

    cluster_info = merged

    if len(cluster_info) < 3:
        return rows

    # --- Step C: Compute pitches and detect section breaks ---
    pitches: List[float] = []
    for i in range(1, len(cluster_info)):
        pitches.append(cluster_info[i]['center_y_rel']
                       - cluster_info[i - 1]['center_y_rel'])

    if not pitches:
        return rows

    median_pitch = float(np.median(pitches))
    if median_pitch <= 5:
        # Degenerate spacing — trust the gap-based rows.
        return rows

    # A section break is where the gap between line centers is much larger
    # than the normal pitch (sub-headings, section titles, etc.).
    BREAK_FACTOR = 1.8

    # --- Step D: Build sections (consecutive lines with normal spacing) ---
    sections: List[List[Dict]] = []
    current_section: List[Dict] = [cluster_info[0]]

    for i in range(1, len(cluster_info)):
        gap = cluster_info[i]['center_y_rel'] - cluster_info[i - 1]['center_y_rel']
        if gap > median_pitch * BREAK_FACTOR:
            sections.append(current_section)
            current_section = [cluster_info[i]]
        else:
            current_section.append(cluster_info[i])

    if current_section:
        sections.append(current_section)

    # --- Step E: Build row boundaries per section ---
    grid_rows: List[RowGeometry] = []

    for section in sections:
        if not section:
            continue

        if len(section) == 1:
            # Single-line section (likely a heading).
            cl = section[0]
            half_h = max(cl['letter_h'], median_pitch * 0.4)
            # FIX: clamp to the content area and reject degenerate rows,
            # exactly like the multi-line path below — previously a heading
            # near the content edge could yield an out-of-bounds row.
            row_top = max(top_y, cl['center_y_abs'] - half_h)
            row_bot = min(top_y + content_h, cl['center_y_abs'] + half_h)
            if row_bot - row_top < 5:
                continue
            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))
            continue

        # Local pitch for this section.
        local_pitches = [section[i]['center_y_rel'] - section[i - 1]['center_y_rel']
                         for i in range(1, len(section))]
        local_pitch = float(np.median(local_pitches)) if local_pitches else median_pitch

        # Row boundaries are placed at midpoints between consecutive centers:
        # First row: top = center - local_pitch/2
        # Last row:  bottom = center + local_pitch/2
        for i, cl in enumerate(section):
            if i == 0:
                row_top = cl['center_y_abs'] - local_pitch / 2
            else:
                # Midpoint between this center and the previous center.
                row_top = (section[i - 1]['center_y_abs'] + cl['center_y_abs']) / 2

            if i == len(section) - 1:
                row_bot = cl['center_y_abs'] + local_pitch / 2
            else:
                row_bot = (cl['center_y_abs'] + section[i + 1]['center_y_abs']) / 2

            # Clamp to the content area.
            row_top = max(top_y, row_top)
            row_bot = min(top_y + content_h, row_bot)

            if row_bot - row_top < 5:
                continue

            grid_rows.append(RowGeometry(
                index=0,
                x=left_x,
                y=round(row_top),
                width=content_w,
                height=round(row_bot - row_top),
                word_count=len(cl['words']),
                words=cl['words'],
                row_type='content',
                gap_before=0,
            ))

    if not grid_rows:
        return rows

    # --- Step F: Re-assign words to grid rows ---
    # Words may have shifted slightly vs the new grid; assign each word to
    # the row whose center is closest to the word's vertical center.
    for gr in grid_rows:
        gr.words = []

    for w in content_words:
        w_center = w['top'] + top_y + w['height'] / 2
        best_row = None
        best_dist = float('inf')
        for gr in grid_rows:
            dist = abs(w_center - (gr.y + gr.height / 2))
            if dist < best_dist:
                best_dist = dist
                best_row = gr
        # Only accept matches within one pitch — farther words stay unplaced
        # and count against the validation ratio below.
        if best_row is not None and best_dist < median_pitch:
            best_row.words.append(w)

    for gr in grid_rows:
        gr.word_count = len(gr.words)

    # --- Step G: Validate ---
    # content_words is guaranteed non-empty by the earlier >= 5 guard.
    words_placed = sum(gr.word_count for gr in grid_rows)
    match_ratio = words_placed / len(content_words)
    if match_ratio < 0.85:
        logger.info(f"RowGrid: word-center grid only matches {match_ratio:.0%} "
                    f"of words, keeping gap-based rows")
        return rows

    # Remove empty grid rows (no words assigned).
    grid_rows = [gr for gr in grid_rows if gr.word_count > 0]

    # --- Step H: Merge header/footer + re-index ---
    result = list(non_content) + grid_rows
    result.sort(key=lambda r: r.y)
    for i, r in enumerate(result):
        r.index = i

    row_heights = [gr.height for gr in grid_rows]
    min_h = min(row_heights) if row_heights else 0
    max_h = max(row_heights) if row_heights else 0
    logger.info(f"RowGrid: word-center grid applied "
                f"(median_pitch={median_pitch:.0f}px, median_row_h={median_row_h}px, median_wh={median_wh}px, "
                f"y_tol={y_tol}px, {len(line_clusters)} clusters→{len(cluster_info)} merged, "
                f"{len(sections)} sections, "
                f"{len(grid_rows)} grid rows [h={min_h}-{max_h}px], "
                f"was {len(content_rows)} gap-based rows)")

    return result
|
||||
Reference in New Issue
Block a user