Compare commits

...

3 Commits

Author SHA1 Message Date
Benjamin Admin
c7ae44ff17 feat(rag): add 42 new regulations to RAG overview + update collection totals
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 33s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m46s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 23s
New regulations across bp_compliance_ce (11), bp_compliance_gesetze (31),
and bp_compliance_datenschutz (1). Collection totals updated:
gesetze 58304, ce 18183, datenschutz 2448, total 103912.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 01:04:27 +01:00
Benjamin Admin
ce0815007e feat(ocr-pipeline): replace clustering column detection with whitespace-gap analysis
Column detection now uses vertical projection profiles to find whitespace
gaps between columns, then validates gaps against word bounding boxes to
prevent splitting through words. Old clustering algorithm extracted as
fallback (_detect_columns_by_clustering) for pages with < 2 detected gaps.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 00:36:28 +01:00
Benjamin Admin
b03cb0a1e6 Fix Landkarte tab crash: variable name shadowed isInRag function
Local variables named 'isInRag' shadowed the outer function, causing
"isInRag is not a function" error. Renamed to regInRag/codeInRag.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 00:01:01 +01:00
2 changed files with 353 additions and 145 deletions

View File

@@ -1198,8 +1198,56 @@ const REGULATIONS_IN_RAG: Record<string, { collection: string; chunks: number }>
PL_UODO: { collection: 'bp_compliance_gesetze', chunks: 198 }, PL_UODO: { collection: 'bp_compliance_gesetze', chunks: 198 },
CZ_ZOU: { collection: 'bp_compliance_gesetze', chunks: 1120 }, CZ_ZOU: { collection: 'bp_compliance_gesetze', chunks: 1120 },
HU_INFOTV: { collection: 'bp_compliance_gesetze', chunks: 1345 }, HU_INFOTV: { collection: 'bp_compliance_gesetze', chunks: 1345 },
// EDPB Guidelines (bp_compliance_datenschutz: 2.101 total) // EDPB Guidelines (bp_compliance_datenschutz)
EDPB_GUIDELINES_5_2020: { collection: 'bp_compliance_datenschutz', chunks: 245 }, EDPB_GUIDELINES_5_2020: { collection: 'bp_compliance_datenschutz', chunks: 245 },
EDPB_GUIDELINES_7_2020: { collection: 'bp_compliance_datenschutz', chunks: 347 },
// === Neue Regulierungen (2026-02-28) ===
// EU CE-Regulierungen (bp_compliance_ce)
DPF: { collection: 'bp_compliance_ce', chunks: 1232 },
EUCSA: { collection: 'bp_compliance_ce', chunks: 558 },
DATAACT: { collection: 'bp_compliance_ce', chunks: 809 },
DORA: { collection: 'bp_compliance_ce', chunks: 823 },
PSD2: { collection: 'bp_compliance_ce', chunks: 796 },
AMLR: { collection: 'bp_compliance_ce', chunks: 1182 },
MiCA: { collection: 'bp_compliance_ce', chunks: 1640 },
EHDS: { collection: 'bp_compliance_ce', chunks: 1212 },
EAA: { collection: 'bp_compliance_ce', chunks: 433 },
DSM: { collection: 'bp_compliance_ce', chunks: 416 },
GPSR: { collection: 'bp_compliance_ce', chunks: 509 },
// DE Gesetze (bp_compliance_gesetze)
DE_UWG: { collection: 'bp_compliance_gesetze', chunks: 1 },
DE_TKG: { collection: 'bp_compliance_gesetze', chunks: 1631 },
DE_PANGV: { collection: 'bp_compliance_gesetze', chunks: 1 },
DE_DLINFOV: { collection: 'bp_compliance_gesetze', chunks: 21 },
DE_BETRVG: { collection: 'bp_compliance_gesetze', chunks: 498 },
DE_GESCHGEHG: { collection: 'bp_compliance_gesetze', chunks: 63 },
DE_BSIG: { collection: 'bp_compliance_gesetze', chunks: 1 },
DE_USTG_RET: { collection: 'bp_compliance_gesetze', chunks: 1071 },
// AT Gesetze (bp_compliance_gesetze)
AT_DSG_FULL: { collection: 'bp_compliance_gesetze', chunks: 6 },
LI_DSG: { collection: 'bp_compliance_gesetze', chunks: 2 },
AT_ECG: { collection: 'bp_compliance_gesetze', chunks: 120 },
AT_TKG: { collection: 'bp_compliance_gesetze', chunks: 2174 },
AT_KSCHG: { collection: 'bp_compliance_gesetze', chunks: 402 },
AT_FAGG: { collection: 'bp_compliance_gesetze', chunks: 2 },
AT_UGB_RET: { collection: 'bp_compliance_gesetze', chunks: 2828 },
AT_BAO_RET: { collection: 'bp_compliance_gesetze', chunks: 2246 },
AT_MEDIENG: { collection: 'bp_compliance_gesetze', chunks: 571 },
AT_ABGB_AGB: { collection: 'bp_compliance_gesetze', chunks: 2521 },
AT_UWG: { collection: 'bp_compliance_gesetze', chunks: 403 },
// CH Gesetze (bp_compliance_gesetze)
CH_DSV: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_OR_AGB: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_UWG: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_FMG: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_GEBUV: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_ZERTES: { collection: 'bp_compliance_gesetze', chunks: 5 },
CH_ZGB_PERS: { collection: 'bp_compliance_gesetze', chunks: 5 },
// Weitere EU-Laender (bp_compliance_gesetze)
BE_DPA_LAW: { collection: 'bp_compliance_gesetze', chunks: 3 },
FI_TIETOSUOJALAKI: { collection: 'bp_compliance_gesetze', chunks: 2 },
DK_DATABESKYTTELSESLOVEN: { collection: 'bp_compliance_gesetze', chunks: 2 },
LU_DPA_LAW: { collection: 'bp_compliance_gesetze', chunks: 2 },
} }
// Helper: Check if regulation is in RAG // Helper: Check if regulation is in RAG
@@ -1208,17 +1256,17 @@ const isInRag = (code: string): boolean => code in REGULATIONS_IN_RAG
// Helper: Get known chunk count for a regulation // Helper: Get known chunk count for a regulation
const getKnownChunks = (code: string): number => REGULATIONS_IN_RAG[code]?.chunks || 0 const getKnownChunks = (code: string): number => REGULATIONS_IN_RAG[code]?.chunks || 0
// Known collection totals (updated: 2026-02-27) // Known collection totals (updated: 2026-02-28)
const COLLECTION_TOTALS = { const COLLECTION_TOTALS = {
bp_compliance_gesetze: 33929, bp_compliance_gesetze: 58304,
bp_compliance_ce: 7341, bp_compliance_ce: 18183,
bp_legal_templates: 7689, bp_legal_templates: 7689,
bp_compliance_datenschutz: 2101, bp_compliance_datenschutz: 2448,
bp_dsfa_corpus: 7867, bp_dsfa_corpus: 7867,
bp_compliance_recht: 1425, bp_compliance_recht: 1425,
bp_nibis_eh: 7996, bp_nibis_eh: 7996,
total_legal: 33929 + 7341, // gesetze + ce total_legal: 76487, // gesetze + ce
total_all: 68348, total_all: 103912,
} }
// License display labels // License display labels
@@ -2466,17 +2514,17 @@ export default function RAGPage() {
</div> </div>
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-3"> <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-3">
{regs.map((reg) => { {regs.map((reg) => {
const isInRag = isInRag(reg.code) const regInRag = isInRag(reg.code)
return ( return (
<div <div
key={reg.code} key={reg.code}
className={`bg-white p-3 rounded-lg border ${isInRag ? 'border-green-200' : 'border-slate-200'}`} className={`bg-white p-3 rounded-lg border ${regInRag ? 'border-green-200' : 'border-slate-200'}`}
> >
<div className="flex items-center gap-2 mb-1"> <div className="flex items-center gap-2 mb-1">
<span className={`px-2 py-0.5 text-xs rounded ${TYPE_COLORS[reg.type]}`}> <span className={`px-2 py-0.5 text-xs rounded ${TYPE_COLORS[reg.type]}`}>
{reg.code} {reg.code}
</span> </span>
{isInRag ? ( {regInRag ? (
<span className="px-1.5 py-0.5 text-[10px] font-bold bg-green-100 text-green-600 rounded">RAG</span> <span className="px-1.5 py-0.5 text-[10px] font-bold bg-green-100 text-green-600 rounded">RAG</span>
) : ( ) : (
<span className="px-1.5 py-0.5 text-[10px] font-bold bg-red-50 text-red-400 rounded"></span> <span className="px-1.5 py-0.5 text-[10px] font-bold bg-red-50 text-red-400 rounded"></span>
@@ -2513,12 +2561,12 @@ export default function RAGPage() {
<div className="flex flex-wrap gap-2"> <div className="flex flex-wrap gap-2">
{group.regulations.map((code) => { {group.regulations.map((code) => {
const reg = REGULATIONS.find(r => r.code === code) const reg = REGULATIONS.find(r => r.code === code)
const isInRag = isInRag(code) const codeInRag = isInRag(code)
return ( return (
<span <span
key={code} key={code}
className={`px-3 py-1.5 rounded-full text-sm font-medium cursor-pointer ${ className={`px-3 py-1.5 rounded-full text-sm font-medium cursor-pointer ${
isInRag codeInRag
? 'bg-green-100 text-green-700 hover:bg-green-200' ? 'bg-green-100 text-green-700 hover:bg-green-200'
: 'bg-slate-100 text-slate-700 hover:bg-slate-200' : 'bg-slate-100 text-slate-700 hover:bg-slate-200'
}`} }`}
@@ -2526,9 +2574,9 @@ export default function RAGPage() {
setActiveTab('regulations') setActiveTab('regulations')
setExpandedRegulation(code) setExpandedRegulation(code)
}} }}
title={`${reg?.fullName || code}${isInRag ? ' (im RAG)' : ' (nicht im RAG)'}`} title={`${reg?.fullName || code}${codeInRag ? ' (im RAG)' : ' (nicht im RAG)'}`}
> >
{isInRag ? '✓ ' : '✗ '}{code} {codeInRag ? '✓ ' : '✗ '}{code}
</span> </span>
) )
})} })}

View File

@@ -875,11 +875,147 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
# --- Phase A: Geometry Detection --- # --- Phase A: Geometry Detection ---
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.

    Args:
        word_dicts: Word info dicts (ROI-relative 'left'/'top'/'width' keys).
        left_edges: Left-edge x positions of words, relative to the content ROI.
        edge_word_indices: Parallel list mapping each edge to its word_dicts index.
        content_w: Width of the content area in pixels.
        content_h: Height of the content area in pixels.
        left_x: Absolute left bound of the content area.
        right_x: Absolute right bound of the content area.
        top_y: Absolute top bound of the content area.
        bottom_y: Absolute bottom bound of the content area.

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y), or None when
        no words are available or fewer than 3 significant clusters remain.
    """
    # Guard: with no detected words there is nothing to cluster (the
    # unguarded sorted_pairs[0] below would raise IndexError).
    if not left_edges:
        return None

    # --- Cluster left edges within ~1% of the content width of each other ---
    tolerance = max(10, int(content_w * 0.01))
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
    clusters = []       # list of lists of edge x-values
    cluster_widxs = []  # parallel list of lists of word_dicts indices
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    # --- Verticality filter: real columns span much of the page height ---
    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5
    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue  # single-word alignments are treated as noise
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        y_coverage = y_span / content_h if content_h > 0 else 0.0
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'max_edge': max(c_edges),
            'y_min': min(y_positions),
            'y_max': max(y_positions),
            'y_coverage': y_coverage,
        })

    # Primary clusters have strong vertical coverage; secondary ones have
    # moderate coverage but enough supporting words to be credible.
    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    primary_set = set(id(c) for c in primary)
    secondary = [c for c in cluster_infos
                 if id(c) not in primary_set
                 and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
                 and c['count'] >= MIN_WORDS_SECONDARY]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # --- Merge clusters closer than ~6% of content width ---
    # Sub-alignments inside one column collapse into a count-weighted average x.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
            prev = merged[-1]
            total = prev['count'] + s['count']
            avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['mean_x'] = avg_x
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
            prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
        else:
            merged.append(s.copy())
    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")
    # Start each column slightly before its leftmost word edge (~2mm margin).
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
    )
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
) -> Tuple[List[ColumnGeometry], int, int, int, int]:
    """Build ColumnGeometry objects from a list of (abs_start_x, word_count) pairs.

    Each column spans from its start x to the next column's start; the last
    column extends to right_x. Words are re-assigned to columns by their left
    edge, so the word_count supplied in col_starts is informational only and
    the stored word_count is recomputed from the actual assignment.

    Args:
        col_starts: (absolute start x, word count) per column, sorted left to right.
        word_dicts: Word info dicts with ROI-relative 'left' coordinates.
        left_x: Absolute left bound of the content area.
        right_x: Absolute right bound of the content area.
        top_y: Absolute top bound of the content area.
        bottom_y: Absolute bottom bound of the content area.
        content_w: Content-area width in pixels.
        content_h: Content-area height in pixels.

    Returns:
        Tuple of (geometries, left_x, right_x, top_y, bottom_y).
    """
    geometries = []
    # The incoming per-column count is not used; word membership is recomputed.
    for i, (start_x, _count) in enumerate(col_starts):
        # Column ends where the next one starts, or at the right content edge.
        if i + 1 < len(col_starts):
            col_width = col_starts[i + 1][0] - start_x
        else:
            col_width = right_x - start_x
        col_left_rel = start_x - left_x
        col_right_rel = col_left_rel + col_width
        # Assign words by left edge (coordinates are relative to the content ROI).
        col_words = [w for w in word_dicts
                     if col_left_rel <= w['left'] < col_right_rel]
        geometries.append(ColumnGeometry(
            index=i,
            x=start_x,
            y=top_y,
            width=col_width,
            height=content_h,
            word_count=len(col_words),
            words=col_words,
            width_ratio=col_width / content_w if content_w > 0 else 0.0,
        ))
    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{[(g.index, g.x, g.width, g.word_count) for g in geometries]}")
    return (geometries, left_x, right_x, top_y, bottom_y)
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
"""Detect column geometry using whitespace-gap analysis with word validation.
Phase A of the two-phase column detection. Uses vertical projection
profiles to find whitespace gaps between columns, then validates that
no gap cuts through a word bounding box.
Falls back to clustering-based detection if fewer than 2 gaps are found.
Args: Args:
ocr_img: Binarized grayscale image for layout analysis. ocr_img: Binarized grayscale image for layout analysis.
@@ -887,11 +1023,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
Returns: Returns:
Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if
fewer than 3 clusters are found (signals fallback needed). detection fails entirely.
""" """
h, w = ocr_img.shape[:2] h, w = ocr_img.shape[:2]
# --- Find content bounds --- # --- Step 1: Find content bounds ---
inv = cv2.bitwise_not(ocr_img) inv = cv2.bitwise_not(ocr_img)
left_x, right_x, top_y, bottom_y = _find_content_bounds(inv) left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
content_w = right_x - left_x content_w = right_x - left_x
@@ -905,7 +1041,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), " logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
f"y=[{top_y}..{bottom_y}] ({content_h}px)") f"y=[{top_y}..{bottom_y}] ({content_h}px)")
# --- Get word bounding boxes from Tesseract --- # --- Step 2: Get word bounding boxes from Tesseract ---
content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x] content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB)) pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
@@ -915,10 +1051,9 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}") logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
return None return None
# Collect words with their full info
word_dicts = [] word_dicts = []
left_edges = [] left_edges = []
edge_word_indices = [] # Track which word_dicts index each edge belongs to edge_word_indices = []
n_words = len(data['text']) n_words = len(data['text'])
for i in range(n_words): for i in range(n_words):
conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1 conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
@@ -942,146 +1077,171 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area") logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
# --- Cluster left edges (tracking word indices per cluster) --- # --- Step 3: Vertical projection profile ---
tolerance = max(10, int(content_w * 0.01)) content_strip = inv[top_y:bottom_y, left_x:right_x]
v_proj = np.sum(content_strip, axis=0).astype(float)
v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
# Sort edges while keeping word index association # Smooth the projection to avoid noise-induced micro-gaps
sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0]) kernel_size = max(5, content_w // 80)
if kernel_size % 2 == 0:
kernel_size += 1 # keep odd for symmetry
v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
clusters = [] # list of lists of edge x-values # --- Step 4: Find whitespace gaps ---
cluster_widxs = [] # parallel list of lists of word_dicts indices # Threshold: areas with very little ink density are gaps
cur_edges = [sorted_pairs[0][0]] median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
cur_widxs = [sorted_pairs[0][1]] gap_threshold = max(median_density * 0.15, 0.005)
for edge, widx in sorted_pairs[1:]:
if edge - cur_edges[-1] <= tolerance: in_gap = v_smooth < gap_threshold
cur_edges.append(edge) MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width
cur_widxs.append(widx)
# Collect contiguous gap regions
raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI
gap_start = None
for x in range(len(in_gap)):
if in_gap[x]:
if gap_start is None:
gap_start = x
else: else:
clusters.append(cur_edges) if gap_start is not None:
cluster_widxs.append(cur_widxs) gap_width = x - gap_start
cur_edges = [edge] if gap_width >= MIN_GAP_WIDTH:
cur_widxs = [widx] raw_gaps.append((gap_start, x))
clusters.append(cur_edges) gap_start = None
cluster_widxs.append(cur_widxs) # Handle gap at the right edge
if gap_start is not None:
gap_width = len(in_gap) - gap_start
if gap_width >= MIN_GAP_WIDTH:
raw_gaps.append((gap_start, len(in_gap)))
# --- Enrich clusters with Y-span info and apply verticality filter --- logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
MIN_Y_COVERAGE_PRIMARY = 0.30 # Primary columns span >= 30% of page height f"min_width={MIN_GAP_WIDTH}px): "
MIN_Y_COVERAGE_SECONDARY = 0.15 # Secondary columns span >= 15% f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
MIN_WORDS_SECONDARY = 5 # Secondary columns need >= 5 words
cluster_infos = [] # --- Step 5: Validate gaps against word bounding boxes ---
for c_edges, c_widxs in zip(clusters, cluster_widxs): validated_gaps = []
if len(c_edges) < 2: for gap_start_rel, gap_end_rel in raw_gaps:
continue # Check if any word overlaps with this gap region
y_positions = [word_dicts[idx]['top'] for idx in c_widxs] overlapping = False
y_span = max(y_positions) - min(y_positions) for wd in word_dicts:
y_coverage = y_span / content_h if content_h > 0 else 0.0 word_left = wd['left']
word_right = wd['left'] + wd['width']
if word_left < gap_end_rel and word_right > gap_start_rel:
overlapping = True
break
cluster_infos.append({ if not overlapping:
'mean_x': int(np.mean(c_edges)), validated_gaps.append((gap_start_rel, gap_end_rel))
'count': len(c_edges),
'min_edge': min(c_edges),
'max_edge': max(c_edges),
'y_min': min(y_positions),
'y_max': max(y_positions),
'y_coverage': y_coverage,
})
_ci_summary = [(ci['mean_x']+left_x, ci['count'], format(ci['y_coverage'], '.0%')) for ci in cluster_infos[:12]]
logger.info(f"ColumnGeometry: {len(cluster_infos)} clusters with >=2 words "
f"(from {len(clusters)} total), y_coverage: {_ci_summary}")
# Primary: good vertical coverage
primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
# Secondary: moderate coverage with enough words
primary_set = set(id(c) for c in primary)
secondary = [c for c in cluster_infos
if id(c) not in primary_set
and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
and c['count'] >= MIN_WORDS_SECONDARY]
significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
_sig_summary = [(s['mean_x']+left_x, s['count'], format(s['y_coverage'], '.0%')) for s in significant[:10]]
logger.info(f"ColumnGeometry: {len(significant)} significant clusters "
f"(primary={len(primary)}, secondary={len(secondary)}): {_sig_summary}")
if len(significant) < 3:
logger.info("ColumnGeometry: < 3 clusters after verticality filter, signaling fallback")
return None
# --- Merge clusters that are very close ---
# 6% of content width: on a typical 5-col vocab page (~1500px wide),
# this is ~90px, which merges sub-alignments within a single column
# while keeping real column boundaries (~300px apart) separate.
merge_distance = max(30, int(content_w * 0.06))
merged = [significant[0].copy()]
for s in significant[1:]:
if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
prev = merged[-1]
total = prev['count'] + s['count']
avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
prev['mean_x'] = avg_x
prev['count'] = total
prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
prev['y_min'] = min(prev['y_min'], s['y_min'])
prev['y_max'] = max(prev['y_max'], s['y_max'])
prev['y_coverage'] = (prev['y_max'] - prev['y_min']) / content_h if content_h > 0 else 0.0
else: else:
merged.append(s.copy()) # Try to shift the gap to avoid the overlapping word(s)
# Find the tightest word boundaries within the gap region
min_word_left = content_w
max_word_right = 0
for wd in word_dicts:
word_left = wd['left']
word_right = wd['left'] + wd['width']
if word_left < gap_end_rel and word_right > gap_start_rel:
min_word_left = min(min_word_left, word_left)
max_word_right = max(max_word_right, word_right)
# --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors --- # Try gap before the overlapping words
i = 0 if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
absorbed_count = 0 validated_gaps.append((gap_start_rel, min_word_left))
while i < len(merged) and len(merged) > 3: logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
if i + 1 < len(merged): # Try gap after the overlapping words
cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x'] elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
else: validated_gaps.append((max_word_right, gap_end_rel))
cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x']) logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
if cluster_w / content_w < 0.05:
# Absorb into neighbor (prefer left)
if i > 0:
target = merged[i - 1]
else: else:
target = merged[i + 1] logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
target['count'] += merged[i]['count'] f"discarded (word overlap, no room to shift)")
target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
target['y_min'] = min(target['y_min'], merged[i]['y_min'])
target['y_max'] = max(target['y_max'], merged[i]['y_max'])
target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
del merged[i]
absorbed_count += 1
else:
i += 1
if absorbed_count:
logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
_merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged] logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}") f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")
if len(merged) < 3: # --- Step 6: Fallback to clustering if too few gaps ---
logger.info("ColumnGeometry: < 3 merged clusters, signaling fallback") if len(validated_gaps) < 2:
return None logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
return _detect_columns_by_clustering(
word_dicts, left_edges, edge_word_indices,
content_w, content_h, left_x, right_x, top_y, bottom_y,
)
# --- Derive column boundaries --- # --- Step 7: Derive column boundaries from gaps ---
margin_px = max(6, int(content_w * 0.003)) # ~2mm margin before column start # Sort gaps by position
validated_gaps.sort(key=lambda g: g[0])
# Identify margin gaps (first and last) vs interior gaps
# A margin gap touches the edge of the content area (within 2% tolerance)
edge_tolerance = max(10, int(content_w * 0.02))
is_left_margin = validated_gaps[0][0] <= edge_tolerance
is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance
# Interior gaps define column boundaries
# Column starts at the end of a gap, ends at the start of the next gap
col_starts = [] col_starts = []
for m in merged:
abs_start = max(0, left_x + m['min_edge'] - margin_px)
col_starts.append((abs_start, m['count']))
# Calculate column widths and assign words to columns if is_left_margin:
geometries = [] # First column starts after the left margin gap
for i, (start_x, count) in enumerate(col_starts): first_gap_end = validated_gaps[0][1]
interior_gaps = validated_gaps[1:]
else:
# No left margin gap — first column starts at content left edge
first_gap_end = 0
interior_gaps = validated_gaps[:]
if is_right_margin:
# Last gap is right margin — don't use it as column start
interior_gaps_for_boundaries = interior_gaps[:-1]
right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start
else:
interior_gaps_for_boundaries = interior_gaps
right_boundary = content_w
# First column
col_starts.append(left_x + first_gap_end)
# Columns between interior gaps
for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
col_starts.append(left_x + gap_end_rel)
# Count words per column region (for logging)
col_start_counts = []
for i, start_x in enumerate(col_starts):
if i + 1 < len(col_starts): if i + 1 < len(col_starts):
col_width = col_starts[i + 1][0] - start_x next_start = col_starts[i + 1]
elif is_right_margin:
next_start = left_x + right_boundary
else: else:
col_width = right_x - start_x next_start = right_x
# Assign words to this column based on left edge col_left_rel = start_x - left_x
col_right_rel = next_start - left_x
n_words_in_col = sum(1 for w in word_dicts
if col_left_rel <= w['left'] < col_right_rel)
col_start_counts.append((start_x, n_words_in_col))
logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
f"{col_start_counts}")
# --- Step 8: Build ColumnGeometry objects ---
# Determine right edge for each column
all_boundaries = []
for i, start_x in enumerate(col_starts):
if i + 1 < len(col_starts):
end_x = col_starts[i + 1]
elif is_right_margin:
end_x = left_x + right_boundary
else:
end_x = right_x
all_boundaries.append((start_x, end_x))
geometries = []
for i, (start_x, end_x) in enumerate(all_boundaries):
col_width = end_x - start_x
col_left_rel = start_x - left_x col_left_rel = start_x - left_x
col_right_rel = col_left_rel + col_width col_right_rel = col_left_rel + col_width
col_words = [w for w in word_dicts col_words = [w for w in word_dicts