Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s

sed replacement left orphaned hostname references in story page
and empty lines in getApiBase functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-04-24 16:02:04 +02:00
parent b07f802c24
commit 9ba420fa91
150 changed files with 30231 additions and 32053 deletions

View File

@@ -0,0 +1,257 @@
"""
Legacy layout analysis using projection profiles.
Extracted from cv_layout_columns.py — contains:
- analyze_layout() (projection-profile based column/header/footer detection)
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import List
import numpy as np
from cv_vocab_types import PageRegion
from cv_layout_detection import _find_content_bounds
logger = logging.getLogger(__name__)
try:
import cv2
except ImportError:
cv2 = None # type: ignore[assignment]
def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
"""Detect columns, header, and footer using projection profiles.
Uses content-bounds detection to exclude page margins before searching
for column separators within the actual text area.
Args:
layout_img: CLAHE-enhanced grayscale image.
ocr_img: Binarized image for text density analysis.
Returns:
List of PageRegion objects describing detected regions.
"""
h, w = ocr_img.shape[:2]
# Invert: black text on white → white text on black for projection
inv = cv2.bitwise_not(ocr_img)
# --- Find actual content bounds (exclude page margins) ---
left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
content_w = right_x - left_x
content_h = bottom_y - top_y
logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")
if content_w < w * 0.3 or content_h < h * 0.3:
# Fallback if detection seems wrong
left_x, right_x = 0, w
top_y, bottom_y = 0, h
content_w, content_h = w, h
# --- Vertical projection within content area to find column separators ---
content_strip = inv[top_y:bottom_y, left_x:right_x]
v_proj = np.sum(content_strip, axis=0).astype(float)
v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
# Smooth the projection profile
kernel_size = max(5, content_w // 50)
if kernel_size % 2 == 0:
kernel_size += 1
v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
# Debug: log projection profile statistics
p_mean = float(np.mean(v_proj_smooth))
p_median = float(np.median(v_proj_smooth))
p_min = float(np.min(v_proj_smooth))
p_max = float(np.max(v_proj_smooth))
logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
f"mean={p_mean:.4f}, median={p_median:.4f}")
# Find valleys using multiple threshold strategies
# Strategy 1: relative to median (catches clear separators)
# Strategy 2: local minima approach (catches subtle gaps)
threshold = max(p_median * 0.3, p_mean * 0.2)
logger.info(f"Layout: valley threshold={threshold:.4f}")
in_valley = v_proj_smooth < threshold
# Find contiguous valley regions
all_valleys = []
start = None
for x in range(len(v_proj_smooth)):
if in_valley[x] and start is None:
start = x
elif not in_valley[x] and start is not None:
valley_width = x - start
valley_depth = float(np.min(v_proj_smooth[start:x]))
# Valley must be at least 3px wide
if valley_width >= 3:
all_valleys.append((start, x, (start + x) // 2, valley_width, valley_depth))
start = None
logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)}"
f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")
# Filter: valleys must be inside the content area (not at edges)
inner_margin = int(content_w * 0.08)
valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]
# If no valleys found with strict threshold, try local minima approach
if len(valleys) < 2:
logger.info("Layout: trying local minima approach for column detection")
# Divide content into 20 segments, find the 2 lowest
seg_count = 20
seg_width = content_w // seg_count
seg_scores = []
for i in range(seg_count):
sx = i * seg_width
ex = min((i + 1) * seg_width, content_w)
seg_mean = float(np.mean(v_proj_smooth[sx:ex]))
seg_scores.append((i, sx, ex, seg_mean))
seg_scores.sort(key=lambda s: s[3])
logger.info(f"Layout: segment scores (lowest 5): "
f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")
# Find two lowest non-adjacent segments that create reasonable columns
candidate_valleys = []
for seg_idx, sx, ex, seg_mean in seg_scores:
# Must not be at the edges
if seg_idx <= 1 or seg_idx >= seg_count - 2:
continue
# Must be significantly lower than overall mean
if seg_mean < p_mean * 0.6:
center = (sx + ex) // 2
candidate_valleys.append((sx, ex, center, ex - sx, seg_mean))
if len(candidate_valleys) >= 2:
# Pick the best pair: non-adjacent, creating reasonable column widths
candidate_valleys.sort(key=lambda v: v[2])
best_pair = None
best_score = float('inf')
for i in range(len(candidate_valleys)):
for j in range(i + 1, len(candidate_valleys)):
c1 = candidate_valleys[i][2]
c2 = candidate_valleys[j][2]
# Must be at least 20% apart
if (c2 - c1) < content_w * 0.2:
continue
col1 = c1
col2 = c2 - c1
col3 = content_w - c2
# Each column at least 15%
if col1 < content_w * 0.12 or col2 < content_w * 0.12 or col3 < content_w * 0.12:
continue
parts = sorted([col1, col2, col3])
score = parts[2] - parts[0]
if score < best_score:
best_score = score
best_pair = (candidate_valleys[i], candidate_valleys[j])
if best_pair:
valleys = list(best_pair)
logger.info(f"Layout: local minima found 2 valleys: "
f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
logger.info(f"Layout: final {len(valleys)} valleys: "
f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")
regions = []
if len(valleys) >= 2:
# 3-column layout detected
valleys.sort(key=lambda v: v[2])
if len(valleys) == 2:
sep1_center = valleys[0][2]
sep2_center = valleys[1][2]
else:
# Pick the two valleys that best divide into 3 parts
# Prefer wider valleys (more likely true separators)
best_pair = None
best_score = float('inf')
for i in range(len(valleys)):
for j in range(i + 1, len(valleys)):
c1, c2 = valleys[i][2], valleys[j][2]
# Each column should be at least 15% of content width
col1 = c1
col2 = c2 - c1
col3 = content_w - c2
if col1 < content_w * 0.15 or col2 < content_w * 0.15 or col3 < content_w * 0.15:
continue
# Score: lower is better (more even distribution)
parts = sorted([col1, col2, col3])
score = parts[2] - parts[0]
# Bonus for wider valleys (subtract valley width)
score -= (valleys[i][3] + valleys[j][3]) * 0.5
if score < best_score:
best_score = score
best_pair = (c1, c2)
if best_pair:
sep1_center, sep2_center = best_pair
else:
sep1_center = valleys[0][2]
sep2_center = valleys[1][2]
# Convert from content-relative to absolute coordinates
abs_sep1 = sep1_center + left_x
abs_sep2 = sep2_center + left_x
logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=abs_sep1, height=content_h
))
regions.append(PageRegion(
type='column_de', x=abs_sep1, y=top_y,
width=abs_sep2 - abs_sep1, height=content_h
))
regions.append(PageRegion(
type='column_example', x=abs_sep2, y=top_y,
width=w - abs_sep2, height=content_h
))
elif len(valleys) == 1:
# 2-column layout
abs_sep = valleys[0][2] + left_x
logger.info(f"Layout: 2 columns at separator x={abs_sep}")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=abs_sep, height=content_h
))
regions.append(PageRegion(
type='column_de', x=abs_sep, y=top_y,
width=w - abs_sep, height=content_h
))
else:
# No columns detected — run full-page OCR as single column
logger.warning("Layout: no column separators found, using full page")
regions.append(PageRegion(
type='column_en', x=0, y=top_y,
width=w, height=content_h
))
# Add header/footer info (gap-based detection with fallback)
# Lazy import to avoid circular dependency with cv_layout.py
from cv_layout_detection import _add_header_footer
_add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)
top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
col_count = len([r for r in regions if r.type.startswith('column')])
logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")
return regions