Add scan quality scoring, column limit, image enhancement (Steps 1-3)

Step 1: scan_quality.py — Laplacian blur + contrast scoring, adjusts
OCR confidence threshold (40 for good scans, 30 for degraded).
Quality report included in API response + shown in frontend.
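scan_quality.py itself is not part of this diff; as a minimal sketch of the scoring idea, assuming a grayscale page as a numpy array (function names, cutoffs, and the report keys here are illustrative, not the module's real API):

```python
import numpy as np

# Standard 3x3 Laplacian kernel: the variance of the Laplacian response
# is a common blur metric and drops sharply on defocused/degraded scans.
_LAPLACIAN = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=float)

def laplacian_variance(gray: np.ndarray) -> float:
    """Variance of the Laplacian response (higher = sharper)."""
    p = np.pad(gray.astype(float), 1, mode='edge')
    h, w = gray.shape
    resp = sum(_LAPLACIAN[i, j] * p[i:i + h, j:j + w]
               for i in range(3) for j in range(3))
    return float(resp.var())

def score_scan(gray: np.ndarray,
               blur_cutoff: float = 100.0,      # assumed cutoff, not from the diff
               contrast_cutoff: float = 40.0) -> dict:
    """Combine blur + contrast into a quality verdict and pick the OCR
    confidence threshold accordingly (40 for good scans, 30 for degraded)."""
    blur = laplacian_variance(gray)
    contrast = float(gray.astype(float).std())
    degraded = blur < blur_cutoff or contrast < contrast_cutoff
    return {
        'blur_score': blur,
        'contrast': contrast,
        'degraded': degraded,
        'min_confidence': 30 if degraded else 40,
    }
```

The dict mirrors the commit's behavior (threshold 40 vs 30) and could be attached to the API response as the quality report.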

Step 2: max_columns parameter in cv_words_first.py — limits column
detection to 3 for vocab tables, preventing phantom columns D/E
from degraded OCR fragments.
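The pruning rule can be shown on toy numbers (a standalone sketch, not the code from cv_words_first.py): N columns need N-1 split points, so only the N-1 widest gaps survive and narrow phantom gaps are dropped.

```python
def limit_gaps(gaps, max_columns):
    """Keep the (max_columns - 1) largest gaps; return split Xs sorted.

    gaps: list of (gap_size_px, split_x) tuples, one per candidate split.
    """
    if max_columns and len(gaps) >= max_columns:
        gaps = sorted(gaps, key=lambda g: g[0], reverse=True)[:max_columns - 1]
    return sorted(g[1] for g in gaps)

# Four candidate splits would yield five columns; two are tiny phantom
# gaps caused by fragmented OCR words inside a cell.
candidates = [(120.0, 300.0), (15.0, 520.0), (110.0, 700.0), (12.0, 880.0)]
print(limit_gaps(candidates, max_columns=3))  # -> [300.0, 700.0]
```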

Step 3: ocr_image_enhance.py — CLAHE contrast + bilateral filter
denoising + unsharp mask, only for degraded scans (gated by
quality score). Pattern from handwriting_htr_api.py.
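The real chain relies on OpenCV (CLAHE, bilateral filter); as a dependency-free illustration of the quality-score gating plus the unsharp-mask step only, with a box blur standing in for the Gaussian (names hypothetical, assuming the quality dict from Step 1):

```python
import numpy as np

def _box_blur3(img: np.ndarray) -> np.ndarray:
    """3x3 box blur with edge padding (stand-in for a Gaussian blur)."""
    p = np.pad(img.astype(float), 1, mode='edge')
    h, w = img.shape
    return sum(p[i:i + h, j:j + w] for i in range(3) for j in range(3)) / 9.0

def enhance_if_degraded(gray: np.ndarray, quality: dict,
                        amount: float = 1.5) -> np.ndarray:
    """Apply an unsharp mask only when the quality score flags the scan
    as degraded; good scans pass through untouched."""
    if not quality.get('degraded'):
        return gray
    blurred = _box_blur3(gray)
    # Unsharp mask: original plus amplified (original - blurred) detail.
    sharpened = gray.astype(float) + amount * (gray - blurred)
    return np.clip(sharpened, 0, 255).astype(np.uint8)
```

Gating on the score keeps good scans byte-identical, which matters for reproducibility of the OCR output.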

Frontend: quality info shown in extraction status after processing.
Reprocess button now derives pages from vocabulary data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Benjamin Admin
2026-04-23 14:58:39 +02:00
parent 5a154b744d
commit 2f34ee9ede
5 changed files with 267 additions and 16 deletions


@@ -35,9 +35,15 @@ def _cluster_columns(
     words: List[Dict],
     img_w: int,
     min_gap_pct: float = 3.0,
+    max_columns: Optional[int] = None,
 ) -> List[Dict[str, Any]]:
     """Cluster words into columns by finding large horizontal gaps.
 
+    Args:
+        max_columns: If set, limits the number of columns by merging
+            the closest adjacent pairs until the count matches.
+            Prevents phantom columns from degraded OCR.
+
     Returns a list of column dicts:
         [{'index': 0, 'type': 'column_1', 'x_min': ..., 'x_max': ...}, ...]
     sorted left-to-right.
@@ -57,17 +63,28 @@ def _cluster_columns(
     # Find X-gap boundaries between consecutive words (sorted by X-center)
     # For each word, compute right edge; for next word, compute left edge
     boundaries: List[float] = []  # X positions where columns split
+    # Collect gaps with their sizes for max_columns enforcement
+    gaps: List[Tuple[float, float]] = []  # (gap_size, split_x)
     for i in range(len(sorted_w) - 1):
         right_edge = sorted_w[i]['left'] + sorted_w[i]['width']
         left_edge = sorted_w[i + 1]['left']
         gap = left_edge - right_edge
         if gap > min_gap_px:
             # Split point is midway through the gap
-            boundaries.append((right_edge + left_edge) / 2)
+            split_x = (right_edge + left_edge) / 2
+            gaps.append((gap, split_x))
+
+    # If max_columns is set, keep only the (max_columns - 1) largest gaps
+    if max_columns and len(gaps) >= max_columns:
+        gaps.sort(key=lambda g: g[0], reverse=True)
+        logger.info(
+            f"_cluster_columns: limited to {max_columns} columns "
+            f"(removed {len(gaps) - (max_columns - 1)} smallest gaps)"
+        )
+        gaps = gaps[:max_columns - 1]
+    boundaries = sorted(g[1] for g in gaps)
+
     # Build column ranges from boundaries
     # Column ranges: (-inf, boundary[0]), (boundary[0], boundary[1]), ..., (boundary[-1], +inf)
     col_edges = [0.0] + boundaries + [float(img_w)]
     columns = []
     for ci in range(len(col_edges) - 1):
@@ -302,6 +319,7 @@ def build_grid_from_words(
     img_h: int,
     min_confidence: int = 30,
     box_rects: Optional[List[Dict]] = None,
+    max_columns: Optional[int] = None,
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     """Build a cell grid bottom-up from Tesseract word boxes.
@@ -359,8 +377,9 @@ def build_grid_from_words(
         return [], []
 
     # Step 1: cluster columns
-    columns = _cluster_columns(words, img_w)
-    logger.info("build_grid_from_words: %d column(s) detected", len(columns))
+    columns = _cluster_columns(words, img_w, max_columns=max_columns)
+    logger.info("build_grid_from_words: %d column(s) detected%s",
+                len(columns), f" (max={max_columns})" if max_columns else "")
 
     # Step 2: cluster rows
     rows = _cluster_rows(words)