Compare commits
3 Commits
5a45cbf605
...
c7ae44ff17
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c7ae44ff17 | ||
|
|
ce0815007e | ||
|
|
b03cb0a1e6 |
@@ -1198,8 +1198,56 @@ const REGULATIONS_IN_RAG: Record<string, { collection: string; chunks: number }>
|
||||
PL_UODO: { collection: 'bp_compliance_gesetze', chunks: 198 },
|
||||
CZ_ZOU: { collection: 'bp_compliance_gesetze', chunks: 1120 },
|
||||
HU_INFOTV: { collection: 'bp_compliance_gesetze', chunks: 1345 },
|
||||
// EDPB Guidelines (bp_compliance_datenschutz: 2.101 total)
|
||||
// EDPB Guidelines (bp_compliance_datenschutz)
|
||||
EDPB_GUIDELINES_5_2020: { collection: 'bp_compliance_datenschutz', chunks: 245 },
|
||||
EDPB_GUIDELINES_7_2020: { collection: 'bp_compliance_datenschutz', chunks: 347 },
|
||||
// === Neue Regulierungen (2026-02-28) ===
|
||||
// EU CE-Regulierungen (bp_compliance_ce)
|
||||
DPF: { collection: 'bp_compliance_ce', chunks: 1232 },
|
||||
EUCSA: { collection: 'bp_compliance_ce', chunks: 558 },
|
||||
DATAACT: { collection: 'bp_compliance_ce', chunks: 809 },
|
||||
DORA: { collection: 'bp_compliance_ce', chunks: 823 },
|
||||
PSD2: { collection: 'bp_compliance_ce', chunks: 796 },
|
||||
AMLR: { collection: 'bp_compliance_ce', chunks: 1182 },
|
||||
MiCA: { collection: 'bp_compliance_ce', chunks: 1640 },
|
||||
EHDS: { collection: 'bp_compliance_ce', chunks: 1212 },
|
||||
EAA: { collection: 'bp_compliance_ce', chunks: 433 },
|
||||
DSM: { collection: 'bp_compliance_ce', chunks: 416 },
|
||||
GPSR: { collection: 'bp_compliance_ce', chunks: 509 },
|
||||
// DE Gesetze (bp_compliance_gesetze)
|
||||
DE_UWG: { collection: 'bp_compliance_gesetze', chunks: 1 },
|
||||
DE_TKG: { collection: 'bp_compliance_gesetze', chunks: 1631 },
|
||||
DE_PANGV: { collection: 'bp_compliance_gesetze', chunks: 1 },
|
||||
DE_DLINFOV: { collection: 'bp_compliance_gesetze', chunks: 21 },
|
||||
DE_BETRVG: { collection: 'bp_compliance_gesetze', chunks: 498 },
|
||||
DE_GESCHGEHG: { collection: 'bp_compliance_gesetze', chunks: 63 },
|
||||
DE_BSIG: { collection: 'bp_compliance_gesetze', chunks: 1 },
|
||||
DE_USTG_RET: { collection: 'bp_compliance_gesetze', chunks: 1071 },
|
||||
// AT Gesetze (bp_compliance_gesetze)
|
||||
AT_DSG_FULL: { collection: 'bp_compliance_gesetze', chunks: 6 },
|
||||
LI_DSG: { collection: 'bp_compliance_gesetze', chunks: 2 },
|
||||
AT_ECG: { collection: 'bp_compliance_gesetze', chunks: 120 },
|
||||
AT_TKG: { collection: 'bp_compliance_gesetze', chunks: 2174 },
|
||||
AT_KSCHG: { collection: 'bp_compliance_gesetze', chunks: 402 },
|
||||
AT_FAGG: { collection: 'bp_compliance_gesetze', chunks: 2 },
|
||||
AT_UGB_RET: { collection: 'bp_compliance_gesetze', chunks: 2828 },
|
||||
AT_BAO_RET: { collection: 'bp_compliance_gesetze', chunks: 2246 },
|
||||
AT_MEDIENG: { collection: 'bp_compliance_gesetze', chunks: 571 },
|
||||
AT_ABGB_AGB: { collection: 'bp_compliance_gesetze', chunks: 2521 },
|
||||
AT_UWG: { collection: 'bp_compliance_gesetze', chunks: 403 },
|
||||
// CH Gesetze (bp_compliance_gesetze)
|
||||
CH_DSV: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_OR_AGB: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_UWG: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_FMG: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_GEBUV: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_ZERTES: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
CH_ZGB_PERS: { collection: 'bp_compliance_gesetze', chunks: 5 },
|
||||
// Weitere EU-Laender (bp_compliance_gesetze)
|
||||
BE_DPA_LAW: { collection: 'bp_compliance_gesetze', chunks: 3 },
|
||||
FI_TIETOSUOJALAKI: { collection: 'bp_compliance_gesetze', chunks: 2 },
|
||||
DK_DATABESKYTTELSESLOVEN: { collection: 'bp_compliance_gesetze', chunks: 2 },
|
||||
LU_DPA_LAW: { collection: 'bp_compliance_gesetze', chunks: 2 },
|
||||
}
|
||||
|
||||
// Helper: Check if regulation is in RAG
|
||||
@@ -1208,17 +1256,17 @@ const isInRag = (code: string): boolean => code in REGULATIONS_IN_RAG
|
||||
// Helper: look up the recorded chunk count for a regulation code.
// Yields 0 when the code has no entry in REGULATIONS_IN_RAG (or the
// recorded count is falsy).
const getKnownChunks = (code: string): number => {
  const entry = REGULATIONS_IN_RAG[code]
  return (entry && entry.chunks) || 0
}
|
||||
|
||||
// Known collection totals (updated: 2026-02-28)
//
// Each key appears exactly once. The literal previously listed several keys
// twice (stale and current values side by side); only the later value takes
// effect at runtime, and strict TypeScript rejects duplicate keys outright,
// so only the current values are kept here.
const COLLECTION_TOTALS = {
  bp_compliance_gesetze: 58304,
  bp_compliance_ce: 18183,
  bp_legal_templates: 7689,
  bp_compliance_datenschutz: 2448,
  bp_dsfa_corpus: 7867,
  bp_compliance_recht: 1425,
  bp_nibis_eh: 7996,
  total_legal: 76487, // gesetze + ce (58304 + 18183)
  total_all: 103912, // sum of the seven collections above
}
|
||||
|
||||
// License display labels
|
||||
@@ -2466,17 +2514,17 @@ export default function RAGPage() {
|
||||
</div>
|
||||
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-3">
|
||||
{regs.map((reg) => {
|
||||
const isInRag = isInRag(reg.code)
|
||||
const regInRag = isInRag(reg.code)
|
||||
return (
|
||||
<div
|
||||
key={reg.code}
|
||||
className={`bg-white p-3 rounded-lg border ${isInRag ? 'border-green-200' : 'border-slate-200'}`}
|
||||
className={`bg-white p-3 rounded-lg border ${regInRag ? 'border-green-200' : 'border-slate-200'}`}
|
||||
>
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<span className={`px-2 py-0.5 text-xs rounded ${TYPE_COLORS[reg.type]}`}>
|
||||
{reg.code}
|
||||
</span>
|
||||
{isInRag ? (
|
||||
{regInRag ? (
|
||||
<span className="px-1.5 py-0.5 text-[10px] font-bold bg-green-100 text-green-600 rounded">RAG</span>
|
||||
) : (
|
||||
<span className="px-1.5 py-0.5 text-[10px] font-bold bg-red-50 text-red-400 rounded">✗</span>
|
||||
@@ -2513,12 +2561,12 @@ export default function RAGPage() {
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{group.regulations.map((code) => {
|
||||
const reg = REGULATIONS.find(r => r.code === code)
|
||||
const isInRag = isInRag(code)
|
||||
const codeInRag = isInRag(code)
|
||||
return (
|
||||
<span
|
||||
key={code}
|
||||
className={`px-3 py-1.5 rounded-full text-sm font-medium cursor-pointer ${
|
||||
isInRag
|
||||
codeInRag
|
||||
? 'bg-green-100 text-green-700 hover:bg-green-200'
|
||||
: 'bg-slate-100 text-slate-700 hover:bg-slate-200'
|
||||
}`}
|
||||
@@ -2526,9 +2574,9 @@ export default function RAGPage() {
|
||||
setActiveTab('regulations')
|
||||
setExpandedRegulation(code)
|
||||
}}
|
||||
title={`${reg?.fullName || code}${isInRag ? ' (im RAG)' : ' (nicht im RAG)'}`}
|
||||
title={`${reg?.fullName || code}${codeInRag ? ' (im RAG)' : ' (nicht im RAG)'}`}
|
||||
>
|
||||
{isInRag ? '✓ ' : '✗ '}{code}
|
||||
{codeInRag ? '✓ ' : '✗ '}{code}
|
||||
</span>
|
||||
)
|
||||
})}
|
||||
|
||||
@@ -875,11 +875,147 @@ def analyze_layout(layout_img: np.ndarray, ocr_img: np.ndarray) -> List[PageRegi
|
||||
|
||||
# --- Phase A: Geometry Detection ---
|
||||
|
||||
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
|
||||
"""Detect column geometry by clustering left-aligned word positions.
|
||||
def _detect_columns_by_clustering(
    word_dicts: List[Dict],
    left_edges: List[int],
    edge_word_indices: List[int],
    content_w: int,
    content_h: int,
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
    """Fallback: detect columns by clustering left-aligned word positions.

    Used when the primary gap-based algorithm finds fewer than 2 gaps.

    Args:
        word_dicts: Word records; each is read via its 'left' and 'top' keys.
        left_edges: Left x-coordinate of each candidate word. The values are
            added to ``left_x`` to obtain absolute positions, so they are
            treated as relative to the content area.
        edge_word_indices: For each entry of ``left_edges``, the index of the
            corresponding record in ``word_dicts``.
        content_w: Width of the content area in pixels.
        content_h: Height of the content area in pixels.
        left_x, right_x, top_y, bottom_y: Content bounds, passed through to
            ``_build_geometries_from_starts`` and returned unchanged.

    Returns:
        ``(geometries, left_x, right_x, top_y, bottom_y)``, or ``None`` when
        there are no edges to cluster or fewer than 3 column candidates
        survive filtering/merging.
    """
    sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
    if not sorted_pairs:
        # Without this guard the seed access sorted_pairs[0] below would raise
        # IndexError for a page on which no usable words were found.
        logger.info("ColumnGeometry clustering fallback: no word edges to cluster")
        return None

    # Step 1: group sorted left edges; an edge joins the current cluster when
    # it lies within `tolerance` px of the previous edge in that cluster.
    tolerance = max(10, int(content_w * 0.01))
    clusters = []
    cluster_widxs = []
    cur_edges = [sorted_pairs[0][0]]
    cur_widxs = [sorted_pairs[0][1]]
    for edge, widx in sorted_pairs[1:]:
        if edge - cur_edges[-1] <= tolerance:
            cur_edges.append(edge)
            cur_widxs.append(widx)
        else:
            clusters.append(cur_edges)
            cluster_widxs.append(cur_widxs)
            cur_edges = [edge]
            cur_widxs = [widx]
    clusters.append(cur_edges)
    cluster_widxs.append(cur_widxs)

    MIN_Y_COVERAGE_PRIMARY = 0.30
    MIN_Y_COVERAGE_SECONDARY = 0.15
    MIN_WORDS_SECONDARY = 5

    # Step 2: summarise each cluster of >= 2 edges, including the fraction of
    # the content height spanned by the 'top' values of its words.
    cluster_infos = []
    for c_edges, c_widxs in zip(clusters, cluster_widxs):
        if len(c_edges) < 2:
            continue
        y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
        y_span = max(y_positions) - min(y_positions)
        cluster_infos.append({
            'mean_x': int(np.mean(c_edges)),
            'count': len(c_edges),
            'min_edge': min(c_edges),
            'y_coverage': y_span / content_h if content_h > 0 else 0.0,
        })

    # Step 3: keep clusters with good vertical coverage (primary) plus those
    # with moderate coverage and enough words (secondary).
    primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
    secondary = [
        c for c in cluster_infos
        if c['y_coverage'] < MIN_Y_COVERAGE_PRIMARY
        and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
        and c['count'] >= MIN_WORDS_SECONDARY
    ]
    significant = sorted(primary + secondary, key=lambda c: c['mean_x'])

    if len(significant) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 significant clusters")
        return None

    # Step 4: merge neighbours whose mean x-positions are closer than
    # `merge_distance`; the merged mean is the word-count-weighted average.
    merge_distance = max(30, int(content_w * 0.06))
    merged = [significant[0].copy()]
    for s in significant[1:]:
        prev = merged[-1]
        if s['mean_x'] - prev['mean_x'] < merge_distance:
            total = prev['count'] + s['count']
            prev['mean_x'] = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
            prev['count'] = total
            prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
        else:
            merged.append(s.copy())

    if len(merged) < 3:
        logger.info("ColumnGeometry clustering fallback: < 3 merged clusters")
        return None

    logger.info(f"ColumnGeometry clustering fallback: {len(merged)} columns from clustering")

    # Step 5: each column starts slightly left of its smallest edge (clamped
    # at 0), converted to absolute x by adding left_x.
    margin_px = max(6, int(content_w * 0.003))
    return _build_geometries_from_starts(
        [(max(0, left_x + m['min_edge'] - margin_px), m['count']) for m in merged],
        word_dicts, left_x, right_x, top_y, bottom_y, content_w, content_h,
    )
|
||||
|
||||
|
||||
def _build_geometries_from_starts(
    col_starts: List[Tuple[int, int]],
    word_dicts: List[Dict],
    left_x: int,
    right_x: int,
    top_y: int,
    bottom_y: int,
    content_w: int,
    content_h: int,
) -> Tuple[List[ColumnGeometry], int, int, int, int]:
    """Turn (abs_start_x, word_count) pairs into ColumnGeometry objects.

    Every column runs from its own start to the start of the next column;
    the final column runs to ``right_x``. A word is assigned to a column when
    its 'left' value, compared against the column bounds shifted by
    ``-left_x``, falls in the half-open range [start, end). The word_count
    supplied in ``col_starts`` is not used; the count stored on each geometry
    is the number of words actually assigned.

    Returns:
        ``(geometries, left_x, right_x, top_y, bottom_y)``.
    """
    starts = [start for start, _count in col_starts]
    # End of column i is the start of column i+1; the last one ends at right_x.
    ends = starts[1:] + [right_x]

    geometries = []
    for idx, (x_abs, x_end) in enumerate(zip(starts, ends)):
        width = x_end - x_abs
        rel_lo = x_abs - left_x
        rel_hi = rel_lo + width
        members = [w for w in word_dicts if rel_lo <= w['left'] < rel_hi]
        ratio = width / content_w if content_w > 0 else 0.0
        geometries.append(ColumnGeometry(
            index=idx,
            x=x_abs,
            y=top_y,
            width=width,
            height=content_h,
            word_count=len(members),
            words=members,
            width_ratio=ratio,
        ))

    summary = [(g.index, g.x, g.width, g.word_count) for g in geometries]
    logger.info(f"ColumnGeometry: {len(geometries)} columns: "
                f"{summary}")
    return (geometries, left_x, right_x, top_y, bottom_y)
|
||||
|
||||
|
||||
def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Optional[Tuple[List[ColumnGeometry], int, int, int, int]]:
|
||||
"""Detect column geometry using whitespace-gap analysis with word validation.
|
||||
|
||||
Phase A of the two-phase column detection. Uses vertical projection
|
||||
profiles to find whitespace gaps between columns, then validates that
|
||||
no gap cuts through a word bounding box.
|
||||
|
||||
Falls back to clustering-based detection if fewer than 2 gaps are found.
|
||||
|
||||
Args:
|
||||
ocr_img: Binarized grayscale image for layout analysis.
|
||||
@@ -887,11 +1023,11 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
|
||||
Returns:
|
||||
Tuple of (geometries, left_x, right_x, top_y, bottom_y) or None if
|
||||
fewer than 3 clusters are found (signals fallback needed).
|
||||
detection fails entirely.
|
||||
"""
|
||||
h, w = ocr_img.shape[:2]
|
||||
|
||||
# --- Find content bounds ---
|
||||
# --- Step 1: Find content bounds ---
|
||||
inv = cv2.bitwise_not(ocr_img)
|
||||
left_x, right_x, top_y, bottom_y = _find_content_bounds(inv)
|
||||
content_w = right_x - left_x
|
||||
@@ -905,7 +1041,7 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
logger.info(f"ColumnGeometry: content bounds x=[{left_x}..{right_x}] ({content_w}px), "
|
||||
f"y=[{top_y}..{bottom_y}] ({content_h}px)")
|
||||
|
||||
# --- Get word bounding boxes from Tesseract ---
|
||||
# --- Step 2: Get word bounding boxes from Tesseract ---
|
||||
content_roi = dewarped_bgr[top_y:bottom_y, left_x:right_x]
|
||||
pil_img = Image.fromarray(cv2.cvtColor(content_roi, cv2.COLOR_BGR2RGB))
|
||||
|
||||
@@ -915,10 +1051,9 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
logger.warning(f"ColumnGeometry: Tesseract image_to_data failed: {e}")
|
||||
return None
|
||||
|
||||
# Collect words with their full info
|
||||
word_dicts = []
|
||||
left_edges = []
|
||||
edge_word_indices = [] # Track which word_dicts index each edge belongs to
|
||||
edge_word_indices = []
|
||||
n_words = len(data['text'])
|
||||
for i in range(n_words):
|
||||
conf = int(data['conf'][i]) if str(data['conf'][i]).lstrip('-').isdigit() else -1
|
||||
@@ -942,146 +1077,171 @@ def detect_column_geometry(ocr_img: np.ndarray, dewarped_bgr: np.ndarray) -> Opt
|
||||
|
||||
logger.info(f"ColumnGeometry: {len(left_edges)} words detected in content area")
|
||||
|
||||
# --- Cluster left edges (tracking word indices per cluster) ---
|
||||
tolerance = max(10, int(content_w * 0.01))
|
||||
# --- Step 3: Vertical projection profile ---
|
||||
content_strip = inv[top_y:bottom_y, left_x:right_x]
|
||||
v_proj = np.sum(content_strip, axis=0).astype(float)
|
||||
v_proj_norm = v_proj / (content_h * 255) if content_h > 0 else v_proj
|
||||
|
||||
# Sort edges while keeping word index association
|
||||
sorted_pairs = sorted(zip(left_edges, edge_word_indices), key=lambda p: p[0])
|
||||
# Smooth the projection to avoid noise-induced micro-gaps
|
||||
kernel_size = max(5, content_w // 80)
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size += 1 # keep odd for symmetry
|
||||
v_smooth = np.convolve(v_proj_norm, np.ones(kernel_size) / kernel_size, mode='same')
|
||||
|
||||
clusters = [] # list of lists of edge x-values
|
||||
cluster_widxs = [] # parallel list of lists of word_dicts indices
|
||||
cur_edges = [sorted_pairs[0][0]]
|
||||
cur_widxs = [sorted_pairs[0][1]]
|
||||
for edge, widx in sorted_pairs[1:]:
|
||||
if edge - cur_edges[-1] <= tolerance:
|
||||
cur_edges.append(edge)
|
||||
cur_widxs.append(widx)
|
||||
# --- Step 4: Find whitespace gaps ---
|
||||
# Threshold: areas with very little ink density are gaps
|
||||
median_density = float(np.median(v_smooth[v_smooth > 0])) if np.any(v_smooth > 0) else 0.01
|
||||
gap_threshold = max(median_density * 0.15, 0.005)
|
||||
|
||||
in_gap = v_smooth < gap_threshold
|
||||
MIN_GAP_WIDTH = max(8, content_w // 200) # min ~8px or 0.5% of content width
|
||||
|
||||
# Collect contiguous gap regions
|
||||
raw_gaps = [] # (start_x_rel, end_x_rel) relative to content ROI
|
||||
gap_start = None
|
||||
for x in range(len(in_gap)):
|
||||
if in_gap[x]:
|
||||
if gap_start is None:
|
||||
gap_start = x
|
||||
else:
|
||||
clusters.append(cur_edges)
|
||||
cluster_widxs.append(cur_widxs)
|
||||
cur_edges = [edge]
|
||||
cur_widxs = [widx]
|
||||
clusters.append(cur_edges)
|
||||
cluster_widxs.append(cur_widxs)
|
||||
if gap_start is not None:
|
||||
gap_width = x - gap_start
|
||||
if gap_width >= MIN_GAP_WIDTH:
|
||||
raw_gaps.append((gap_start, x))
|
||||
gap_start = None
|
||||
# Handle gap at the right edge
|
||||
if gap_start is not None:
|
||||
gap_width = len(in_gap) - gap_start
|
||||
if gap_width >= MIN_GAP_WIDTH:
|
||||
raw_gaps.append((gap_start, len(in_gap)))
|
||||
|
||||
# --- Enrich clusters with Y-span info and apply verticality filter ---
|
||||
MIN_Y_COVERAGE_PRIMARY = 0.30 # Primary columns span >= 30% of page height
|
||||
MIN_Y_COVERAGE_SECONDARY = 0.15 # Secondary columns span >= 15%
|
||||
MIN_WORDS_SECONDARY = 5 # Secondary columns need >= 5 words
|
||||
logger.info(f"ColumnGeometry: {len(raw_gaps)} raw gaps found (threshold={gap_threshold:.4f}, "
|
||||
f"min_width={MIN_GAP_WIDTH}px): "
|
||||
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in raw_gaps]}")
|
||||
|
||||
cluster_infos = []
|
||||
for c_edges, c_widxs in zip(clusters, cluster_widxs):
|
||||
if len(c_edges) < 2:
|
||||
continue
|
||||
y_positions = [word_dicts[idx]['top'] for idx in c_widxs]
|
||||
y_span = max(y_positions) - min(y_positions)
|
||||
y_coverage = y_span / content_h if content_h > 0 else 0.0
|
||||
# --- Step 5: Validate gaps against word bounding boxes ---
|
||||
validated_gaps = []
|
||||
for gap_start_rel, gap_end_rel in raw_gaps:
|
||||
# Check if any word overlaps with this gap region
|
||||
overlapping = False
|
||||
for wd in word_dicts:
|
||||
word_left = wd['left']
|
||||
word_right = wd['left'] + wd['width']
|
||||
if word_left < gap_end_rel and word_right > gap_start_rel:
|
||||
overlapping = True
|
||||
break
|
||||
|
||||
cluster_infos.append({
|
||||
'mean_x': int(np.mean(c_edges)),
|
||||
'count': len(c_edges),
|
||||
'min_edge': min(c_edges),
|
||||
'max_edge': max(c_edges),
|
||||
'y_min': min(y_positions),
|
||||
'y_max': max(y_positions),
|
||||
'y_coverage': y_coverage,
|
||||
})
|
||||
|
||||
_ci_summary = [(ci['mean_x']+left_x, ci['count'], format(ci['y_coverage'], '.0%')) for ci in cluster_infos[:12]]
|
||||
logger.info(f"ColumnGeometry: {len(cluster_infos)} clusters with >=2 words "
|
||||
f"(from {len(clusters)} total), y_coverage: {_ci_summary}")
|
||||
|
||||
# Primary: good vertical coverage
|
||||
primary = [c for c in cluster_infos if c['y_coverage'] >= MIN_Y_COVERAGE_PRIMARY]
|
||||
# Secondary: moderate coverage with enough words
|
||||
primary_set = set(id(c) for c in primary)
|
||||
secondary = [c for c in cluster_infos
|
||||
if id(c) not in primary_set
|
||||
and c['y_coverage'] >= MIN_Y_COVERAGE_SECONDARY
|
||||
and c['count'] >= MIN_WORDS_SECONDARY]
|
||||
|
||||
significant = sorted(primary + secondary, key=lambda c: c['mean_x'])
|
||||
|
||||
_sig_summary = [(s['mean_x']+left_x, s['count'], format(s['y_coverage'], '.0%')) for s in significant[:10]]
|
||||
logger.info(f"ColumnGeometry: {len(significant)} significant clusters "
|
||||
f"(primary={len(primary)}, secondary={len(secondary)}): {_sig_summary}")
|
||||
|
||||
if len(significant) < 3:
|
||||
logger.info("ColumnGeometry: < 3 clusters after verticality filter, signaling fallback")
|
||||
return None
|
||||
|
||||
# --- Merge clusters that are very close ---
|
||||
# 6% of content width: on a typical 5-col vocab page (~1500px wide),
|
||||
# this is ~90px, which merges sub-alignments within a single column
|
||||
# while keeping real column boundaries (~300px apart) separate.
|
||||
merge_distance = max(30, int(content_w * 0.06))
|
||||
merged = [significant[0].copy()]
|
||||
for s in significant[1:]:
|
||||
if s['mean_x'] - merged[-1]['mean_x'] < merge_distance:
|
||||
prev = merged[-1]
|
||||
total = prev['count'] + s['count']
|
||||
avg_x = (prev['mean_x'] * prev['count'] + s['mean_x'] * s['count']) // total
|
||||
prev['mean_x'] = avg_x
|
||||
prev['count'] = total
|
||||
prev['min_edge'] = min(prev['min_edge'], s['min_edge'])
|
||||
prev['max_edge'] = max(prev['max_edge'], s['max_edge'])
|
||||
prev['y_min'] = min(prev['y_min'], s['y_min'])
|
||||
prev['y_max'] = max(prev['y_max'], s['y_max'])
|
||||
prev['y_coverage'] = (prev['y_max'] - prev['y_min']) / content_h if content_h > 0 else 0.0
|
||||
if not overlapping:
|
||||
validated_gaps.append((gap_start_rel, gap_end_rel))
|
||||
else:
|
||||
merged.append(s.copy())
|
||||
# Try to shift the gap to avoid the overlapping word(s)
|
||||
# Find the tightest word boundaries within the gap region
|
||||
min_word_left = content_w
|
||||
max_word_right = 0
|
||||
for wd in word_dicts:
|
||||
word_left = wd['left']
|
||||
word_right = wd['left'] + wd['width']
|
||||
if word_left < gap_end_rel and word_right > gap_start_rel:
|
||||
min_word_left = min(min_word_left, word_left)
|
||||
max_word_right = max(max_word_right, word_right)
|
||||
|
||||
# --- Post-merge: absorb tiny clusters (< 5% content width) into neighbors ---
|
||||
i = 0
|
||||
absorbed_count = 0
|
||||
while i < len(merged) and len(merged) > 3:
|
||||
if i + 1 < len(merged):
|
||||
cluster_w = merged[i + 1]['mean_x'] - merged[i]['mean_x']
|
||||
else:
|
||||
cluster_w = content_w - (merged[i]['mean_x'] - merged[0]['mean_x'])
|
||||
if cluster_w / content_w < 0.05:
|
||||
# Absorb into neighbor (prefer left)
|
||||
if i > 0:
|
||||
target = merged[i - 1]
|
||||
# Try gap before the overlapping words
|
||||
if min_word_left - gap_start_rel >= MIN_GAP_WIDTH:
|
||||
validated_gaps.append((gap_start_rel, min_word_left))
|
||||
logger.debug(f"ColumnGeometry: gap shifted left to avoid word at {min_word_left}")
|
||||
# Try gap after the overlapping words
|
||||
elif gap_end_rel - max_word_right >= MIN_GAP_WIDTH:
|
||||
validated_gaps.append((max_word_right, gap_end_rel))
|
||||
logger.debug(f"ColumnGeometry: gap shifted right to avoid word at {max_word_right}")
|
||||
else:
|
||||
target = merged[i + 1]
|
||||
target['count'] += merged[i]['count']
|
||||
target['min_edge'] = min(target['min_edge'], merged[i]['min_edge'])
|
||||
target['max_edge'] = max(target['max_edge'], merged[i]['max_edge'])
|
||||
target['y_min'] = min(target['y_min'], merged[i]['y_min'])
|
||||
target['y_max'] = max(target['y_max'], merged[i]['y_max'])
|
||||
target['y_coverage'] = (target['y_max'] - target['y_min']) / content_h if content_h > 0 else 0.0
|
||||
del merged[i]
|
||||
absorbed_count += 1
|
||||
else:
|
||||
i += 1
|
||||
if absorbed_count:
|
||||
logger.info(f"ColumnGeometry: absorbed {absorbed_count} tiny clusters (< 5% width)")
|
||||
logger.debug(f"ColumnGeometry: gap [{gap_start_rel}..{gap_end_rel}] "
|
||||
f"discarded (word overlap, no room to shift)")
|
||||
|
||||
_merged_summary = [(m['mean_x']+left_x, m['count'], format(m['y_coverage'], '.0%')) for m in merged]
|
||||
logger.info(f"ColumnGeometry: {len(merged)} clusters after merging (dist={merge_distance}px): {_merged_summary}")
|
||||
logger.info(f"ColumnGeometry: {len(validated_gaps)} gaps after word validation: "
|
||||
f"{[(g[0]+left_x, g[1]+left_x, g[1]-g[0]) for g in validated_gaps]}")
|
||||
|
||||
if len(merged) < 3:
|
||||
logger.info("ColumnGeometry: < 3 merged clusters, signaling fallback")
|
||||
return None
|
||||
# --- Step 6: Fallback to clustering if too few gaps ---
|
||||
if len(validated_gaps) < 2:
|
||||
logger.info("ColumnGeometry: < 2 gaps found, falling back to clustering")
|
||||
return _detect_columns_by_clustering(
|
||||
word_dicts, left_edges, edge_word_indices,
|
||||
content_w, content_h, left_x, right_x, top_y, bottom_y,
|
||||
)
|
||||
|
||||
# --- Derive column boundaries ---
|
||||
margin_px = max(6, int(content_w * 0.003)) # ~2mm margin before column start
|
||||
# --- Step 7: Derive column boundaries from gaps ---
|
||||
# Sort gaps by position
|
||||
validated_gaps.sort(key=lambda g: g[0])
|
||||
|
||||
# Identify margin gaps (first and last) vs interior gaps
|
||||
# A margin gap touches the edge of the content area (within 2% tolerance)
|
||||
edge_tolerance = max(10, int(content_w * 0.02))
|
||||
|
||||
is_left_margin = validated_gaps[0][0] <= edge_tolerance
|
||||
is_right_margin = validated_gaps[-1][1] >= content_w - edge_tolerance
|
||||
|
||||
# Interior gaps define column boundaries
|
||||
# Column starts at the end of a gap, ends at the start of the next gap
|
||||
col_starts = []
|
||||
for m in merged:
|
||||
abs_start = max(0, left_x + m['min_edge'] - margin_px)
|
||||
col_starts.append((abs_start, m['count']))
|
||||
|
||||
# Calculate column widths and assign words to columns
|
||||
geometries = []
|
||||
for i, (start_x, count) in enumerate(col_starts):
|
||||
if is_left_margin:
|
||||
# First column starts after the left margin gap
|
||||
first_gap_end = validated_gaps[0][1]
|
||||
interior_gaps = validated_gaps[1:]
|
||||
else:
|
||||
# No left margin gap — first column starts at content left edge
|
||||
first_gap_end = 0
|
||||
interior_gaps = validated_gaps[:]
|
||||
|
||||
if is_right_margin:
|
||||
# Last gap is right margin — don't use it as column start
|
||||
interior_gaps_for_boundaries = interior_gaps[:-1]
|
||||
right_boundary = validated_gaps[-1][0] # last column ends at right margin gap start
|
||||
else:
|
||||
interior_gaps_for_boundaries = interior_gaps
|
||||
right_boundary = content_w
|
||||
|
||||
# First column
|
||||
col_starts.append(left_x + first_gap_end)
|
||||
|
||||
# Columns between interior gaps
|
||||
for gap_start_rel, gap_end_rel in interior_gaps_for_boundaries:
|
||||
col_starts.append(left_x + gap_end_rel)
|
||||
|
||||
# Count words per column region (for logging)
|
||||
col_start_counts = []
|
||||
for i, start_x in enumerate(col_starts):
|
||||
if i + 1 < len(col_starts):
|
||||
col_width = col_starts[i + 1][0] - start_x
|
||||
next_start = col_starts[i + 1]
|
||||
elif is_right_margin:
|
||||
next_start = left_x + right_boundary
|
||||
else:
|
||||
col_width = right_x - start_x
|
||||
next_start = right_x
|
||||
|
||||
# Assign words to this column based on left edge
|
||||
col_left_rel = start_x - left_x
|
||||
col_right_rel = next_start - left_x
|
||||
n_words_in_col = sum(1 for w in word_dicts
|
||||
if col_left_rel <= w['left'] < col_right_rel)
|
||||
col_start_counts.append((start_x, n_words_in_col))
|
||||
|
||||
logger.info(f"ColumnGeometry: {len(col_starts)} columns from {len(validated_gaps)} gaps "
|
||||
f"(left_margin={is_left_margin}, right_margin={is_right_margin}): "
|
||||
f"{col_start_counts}")
|
||||
|
||||
# --- Step 8: Build ColumnGeometry objects ---
|
||||
# Determine right edge for each column
|
||||
all_boundaries = []
|
||||
for i, start_x in enumerate(col_starts):
|
||||
if i + 1 < len(col_starts):
|
||||
end_x = col_starts[i + 1]
|
||||
elif is_right_margin:
|
||||
end_x = left_x + right_boundary
|
||||
else:
|
||||
end_x = right_x
|
||||
all_boundaries.append((start_x, end_x))
|
||||
|
||||
geometries = []
|
||||
for i, (start_x, end_x) in enumerate(all_boundaries):
|
||||
col_width = end_x - start_x
|
||||
col_left_rel = start_x - left_x
|
||||
col_right_rel = col_left_rel + col_width
|
||||
col_words = [w for w in word_dicts
|
||||
|
||||
Reference in New Issue
Block a user