Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
257
klausur-service/backend/cv_layout_analyze.py
Normal file
257
klausur-service/backend/cv_layout_analyze.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""
|
||||
Legacy layout analysis using projection profiles.
|
||||
|
||||
Extracted from cv_layout_columns.py — contains:
|
||||
- analyze_layout() (projection-profile based column/header/footer detection)
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cv_vocab_types import PageRegion
|
||||
from cv_layout_detection import _find_content_bounds
|
||||
|
||||
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# OpenCV is an optional dependency: when it is not installed the name ``cv2``
# is bound to ``None`` so that importing this module never fails. Code that
# uses ``cv2`` must check for ``None`` first.
try:
    import cv2
except ImportError:
    cv2 = None  # type: ignore[assignment]
|
||||
|
||||
|
||||
def _find_valleys(profile: np.ndarray, threshold: float, min_width: int = 3) -> list:
    """Return contiguous runs of *profile* that lie strictly below *threshold*.

    Each valley is a tuple ``(start, end, center, width, depth)`` where
    ``start`` is inclusive, ``end`` is exclusive, ``center`` is
    ``(start + end) // 2``, ``width`` is ``end - start`` and ``depth`` is the
    minimum profile value inside the run. Runs narrower than *min_width* are
    discarded.

    NOTE: a run that is still open when the profile ends (i.e. one that
    touches the right edge) is never closed and therefore not reported.
    """
    valleys = []
    run_start = None
    for x, is_low in enumerate(profile < threshold):
        if is_low:
            if run_start is None:
                run_start = x
        elif run_start is not None:
            width = x - run_start
            if width >= min_width:
                depth = float(np.min(profile[run_start:x]))
                valleys.append((run_start, x, (run_start + x) // 2, width, depth))
            run_start = None
    return valleys


def _pick_separator_pair(valleys, content_w, min_col_frac, min_gap_frac=0.0, width_bonus=0.0):
    """Choose the two valleys that split ``content_w`` into the most even thirds.

    Args:
        valleys: Valley tuples ``(start, end, center, width, depth)``,
            already sorted by ``center``.
        content_w: Width of the content area in pixels.
        min_col_frac: Minimum width of each of the three resulting columns,
            as a fraction of ``content_w``.
        min_gap_frac: Minimum distance between the two separator centers,
            as a fraction of ``content_w``.
        width_bonus: Factor applied to the summed valley widths and
            subtracted from the score, so wider valleys are preferred.

    Returns:
        A ``(left_valley, right_valley)`` tuple, or ``None`` if no pair
        satisfies the constraints. On equal scores the first pair found wins.
    """
    min_col = content_w * min_col_frac
    min_gap = content_w * min_gap_frac
    best_pair = None
    best_score = float('inf')
    for i, left in enumerate(valleys):
        for right in valleys[i + 1:]:
            c1, c2 = left[2], right[2]
            if (c2 - c1) < min_gap:
                continue
            col1 = c1
            col2 = c2 - c1
            col3 = content_w - c2
            if col1 < min_col or col2 < min_col or col3 < min_col:
                continue
            # Score: lower is better (spread between widest and narrowest column).
            score = max(col1, col2, col3) - min(col1, col2, col3)
            score -= (left[3] + right[3]) * width_bonus
            if score < best_score:
                best_score = score
                best_pair = (left, right)
    return best_pair


def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegion]:
    """Detect columns, header, and footer using projection profiles.

    Uses content-bounds detection to exclude page margins before searching
    for column separators within the actual text area. Separators are
    low-density "valleys" in the smoothed vertical projection of the inverted
    image; if thresholding yields fewer than two usable valleys, a coarse
    20-segment local-minima search is tried instead.

    Args:
        layout_img: CLAHE-enhanced grayscale image. Currently not read by
            this function; kept for interface compatibility.
        ocr_img: Binarized image for text density analysis (dark text on a
            light background; presumably uint8 with values 0/255, since the
            projection is normalised by 255 -- confirm against callers).

    Returns:
        List of PageRegion objects describing detected regions: one to three
        ``column_*`` regions plus whatever ``_add_header_footer`` appends.
    """
    h, w = ocr_img.shape[:2]

    # Invert: black text on white → white text on black for projection.
    # cv2 is optional at module level (it may be None), so fall back to the
    # equivalent NumPy bitwise inversion instead of failing on ``None``.
    if cv2 is not None:
        inv = cv2.bitwise_not(ocr_img)
    else:
        inv = np.bitwise_not(ocr_img)

    # --- Find actual content bounds (exclude page margins) ---
    left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
    content_w = right_x - left_x
    content_h = bottom_y - top_y

    logger.info(f"Layout: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
                f"y=[{top_y}..{bottom_y}] ({content_h}px) in {w}x{h} image")

    if content_w < w * 0.3 or content_h < h * 0.3:
        # Fallback if detection seems wrong: use the full image.
        left_x, right_x = 0, w
        top_y, bottom_y = 0, h
        content_w, content_h = w, h

    # --- Vertical projection within content area to find column separators ---
    content_strip = inv[top_y:bottom_y, left_x:right_x]
    v_proj = np.sum(content_strip, axis=0).astype(float)
    v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj

    # Smooth the projection profile with an odd-sized box filter.
    kernel_size = max(5, content_w // 50)
    if kernel_size % 2 == 0:
        kernel_size += 1
    v_proj_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')

    # Debug: log projection profile statistics
    p_mean = float(np.mean(v_proj_smooth))
    p_median = float(np.median(v_proj_smooth))
    p_min = float(np.min(v_proj_smooth))
    p_max = float(np.max(v_proj_smooth))
    logger.info(f"Layout: v_proj stats — min={p_min:.4f}, max={p_max:.4f}, "
                f"mean={p_mean:.4f}, median={p_median:.4f}")

    # Strategy 1: threshold relative to median/mean (catches clear separators).
    threshold = max(p_median * 0.3, p_mean * 0.2)
    logger.info(f"Layout: valley threshold={threshold:.4f}")

    # Find contiguous valley regions (at least 3px wide).
    all_valleys = _find_valleys(v_proj_smooth, threshold, min_width=3)

    logger.info(f"Layout: raw valleys (before filter): {len(all_valleys)} — "
                f"{[(v[0]+left_x, v[1]+left_x, v[3], f'{v[4]:.4f}') for v in all_valleys[:10]]}")

    # Filter: valleys must be inside the content area (not at edges)
    inner_margin = int(content_w * 0.08)
    valleys = [v for v in all_valleys if inner_margin < v[2] < content_w - inner_margin]

    # Strategy 2: if the threshold found fewer than two valleys, try a
    # local-minima approach (catches subtle gaps).
    if len(valleys) < 2:
        logger.info("Layout: trying local minima approach for column detection")
        # Divide content into 20 segments and score each by its mean density.
        seg_count = 20
        seg_width = content_w // seg_count
        seg_scores = []
        for i in range(seg_count):
            sx = i * seg_width
            ex = min((i + 1) * seg_width, content_w)
            segment = v_proj_smooth[sx:ex]
            if segment.size == 0:
                # Content narrower than seg_count px: an empty slice would
                # only yield NaN (and a RuntimeWarning) and can never
                # qualify as a candidate, so skip it.
                continue
            seg_scores.append((i, sx, ex, float(np.mean(segment))))

        seg_scores.sort(key=lambda s: s[3])
        logger.info(f"Layout: segment scores (lowest 5): "
                    f"{[(s[0], s[1]+left_x, s[2]+left_x, f'{s[3]:.4f}') for s in seg_scores[:5]]}")

        # Candidates: segments away from the edges whose density is
        # significantly lower than the overall mean.
        candidate_valleys = [
            (sx, ex, (sx + ex) // 2, ex - sx, seg_mean)
            for seg_idx, sx, ex, seg_mean in seg_scores
            if 1 < seg_idx < seg_count - 2 and seg_mean < p_mean * 0.6
        ]
        candidate_valleys.sort(key=lambda v: v[2])

        # Pick the best pair: centers at least 20% of the content width
        # apart, each resulting column at least 12% of the content width.
        best_pair = _pick_separator_pair(
            candidate_valleys, content_w, min_col_frac=0.12, min_gap_frac=0.2
        )
        if best_pair:
            valleys = list(best_pair)
            logger.info(f"Layout: local minima found 2 valleys: "
                        f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    logger.info(f"Layout: final {len(valleys)} valleys: "
                f"{[(v[0]+left_x, v[1]+left_x, v[3]) for v in valleys]}")

    regions = []

    if len(valleys) >= 2:
        # 3-column layout detected
        valleys.sort(key=lambda v: v[2])

        # Default to the two left-most valleys; with more than two, prefer
        # the pair that divides the content most evenly (each column at
        # least 15% of content width), with a bonus for wider valleys
        # (more likely true separators).
        sep1_center = valleys[0][2]
        sep2_center = valleys[1][2]
        if len(valleys) > 2:
            best_pair = _pick_separator_pair(
                valleys, content_w, min_col_frac=0.15, width_bonus=0.5
            )
            if best_pair:
                sep1_center, sep2_center = best_pair[0][2], best_pair[1][2]

        # Convert from content-relative to absolute coordinates
        abs_sep1 = sep1_center + left_x
        abs_sep2 = sep2_center + left_x

        logger.info(f"Layout: 3 columns at separators x={abs_sep1}, x={abs_sep2} "
                    f"(widths: {abs_sep1}, {abs_sep2-abs_sep1}, {w-abs_sep2})")

        # Outer columns extend to the image edges (x=0 and x=w), not just
        # to the detected content bounds.
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep1, y=top_y,
            width=abs_sep2 - abs_sep1, height=content_h
        ))
        regions.append(PageRegion(
            type='column_example', x=abs_sep2, y=top_y,
            width=w - abs_sep2, height=content_h
        ))

    elif len(valleys) == 1:
        # 2-column layout
        abs_sep = valleys[0][2] + left_x

        logger.info(f"Layout: 2 columns at separator x={abs_sep}")

        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=abs_sep, height=content_h
        ))
        regions.append(PageRegion(
            type='column_de', x=abs_sep, y=top_y,
            width=w - abs_sep, height=content_h
        ))

    else:
        # No columns detected — run full-page OCR as single column
        logger.warning("Layout: no column separators found, using full page")
        regions.append(PageRegion(
            type='column_en', x=0, y=top_y,
            width=w, height=content_h
        ))

    # Add header/footer info (gap-based detection with fallback)
    # Lazy import to avoid circular dependency with cv_layout.py
    from cv_layout_detection import _add_header_footer
    _add_header_footer(regions, top_y, bottom_y, w, h, inv=inv)

    top_region = next((r.type for r in regions if r.type in ('header', 'margin_top')), 'none')
    bottom_region = next((r.type for r in regions if r.type in ('footer', 'margin_bottom')), 'none')
    col_count = len([r for r in regions if r.type.startswith('column')])
    logger.info(f"Layout: {col_count} columns, top={top_region}, bottom={bottom_region}")

    return regions
|
||||
Reference in New Issue
Block a user