Add scan quality scoring, column limit, image enhancement (Steps 1-3)
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 32s
CI / test-python-klausur (push) Failing after 2m21s
CI / test-python-agent-core (push) Successful in 28s
CI / test-nodejs-website (push) Successful in 20s
Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts OCR confidence threshold (40 for good scans, 30 for degraded). Quality report included in API response + shown in frontend. Step 2: max_columns parameter in cv_words_first.py — limits column detection to 3 for vocab tables, preventing phantom columns D/E from degraded OCR fragments. Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter denoising + unsharp mask, only for degraded scans (gated by quality score). Pattern from handwriting_htr_api.py. Frontend: quality info shown in extraction status after processing. Reprocess button now derives pages from vocabulary data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -35,9 +35,15 @@ def _cluster_columns(
|
||||
words: List[Dict],
|
||||
img_w: int,
|
||||
min_gap_pct: float = 3.0,
|
||||
max_columns: Optional[int] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Cluster words into columns by finding large horizontal gaps.
|
||||
|
||||
Args:
|
||||
max_columns: If set, limits the number of columns by merging
|
||||
the closest adjacent pairs until the count matches.
|
||||
Prevents phantom columns from degraded OCR.
|
||||
|
||||
Returns a list of column dicts:
|
||||
[{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
|
||||
sorted left-to-right.
|
||||
@@ -57,17 +63,28 @@ def _cluster_columns(
|
||||
|
||||
# Find X-gap boundaries between consecutive words (sorted by X-center)
|
||||
# For each word, compute right edge; for next word, compute left edge
|
||||
boundaries: List[float] = [] # X positions where columns split
|
||||
# Collect gaps with their sizes for max_columns enforcement
|
||||
gaps: List[Tuple[float, float]] = [] # (gap_size, split_x)
|
||||
for i in range(len(sorted_w) - 1):
|
||||
right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
|
||||
left_edge = sorted_w[i + 1]['left']
|
||||
gap = left_edge - right_edge
|
||||
if gap > min_gap_px:
|
||||
# Split point is midway through the gap
|
||||
boundaries.append((right_edge + left_edge) / 2)
|
||||
split_x = (right_edge + left_edge) / 2
|
||||
gaps.append((gap, split_x))
|
||||
|
||||
# If max_columns is set, keep only the (max_columns - 1) largest gaps,
# since k column boundaries produce k + 1 columns. Smaller gaps are the
# likely phantom splits introduced by degraded OCR fragments.
if max_columns and len(gaps) >= max_columns:
    # Capture the discard count BEFORE truncating; computing it afterwards
    # would always report max_columns - 1 regardless of the real number.
    removed = len(gaps) - (max_columns - 1)
    gaps.sort(key=lambda g: g[0], reverse=True)
    gaps = gaps[:max_columns - 1]
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logger.info(
        "_cluster_columns: limited to %d columns (removed %d smallest gaps)",
        max_columns, removed,
    )
|
||||
|
||||
boundaries = sorted(g[1] for g in gaps)
|
||||
|
||||
# Build column ranges from boundaries
|
||||
# Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
|
||||
col_edges = [0.0] + boundaries + [float(img_w)]
|
||||
columns = []
|
||||
for ci in range(len(col_edges) - 1):
|
||||
@@ -302,6 +319,7 @@ def build_grid_from_words(
|
||||
img_h: int,
|
||||
min_confidence: int = 30,
|
||||
box_rects: Optional[List[Dict]] = None,
|
||||
max_columns: Optional[int] = None,
|
||||
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
"""Build a cell grid bottom-up from Tesseract word boxes.
|
||||
|
||||
@@ -359,8 +377,9 @@ def build_grid_from_words(
|
||||
return [], []
|
||||
|
||||
# Step 1: cluster columns
|
||||
columns = _cluster_columns(words, img_w)
|
||||
logger.info("build_grid_from_words: %d column(s) detected", len(columns))
|
||||
columns = _cluster_columns(words, img_w, max_columns=max_columns)
|
||||
logger.info("build_grid_from_words: %d column(s) detected%s",
|
||||
len(columns), f" (max={max_columns})" if max_columns else "")
|
||||
|
||||
# Step 2: cluster rows
|
||||
rows = _cluster_rows(words)
|
||||
|
||||
Reference in New Issue
Block a user