feat: add color detection for OCR word boxes
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled

New cv_color_detect.py module:
- detect_word_colors(): annotates existing words with text color (HSV analysis)
- recover_colored_text(): finds colored text regions missed by standard OCR
  (e.g. red ! markers) using HSV masks + contour detection

Integrated into build-grid: words get color/color_name fields, recovered
colored regions are merged into the word list before grid building.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-15 00:50:09 +01:00
parent 39a4d8564c
commit 2bd63ec402
2 changed files with 280 additions and 1 deletions

View File

@@ -0,0 +1,253 @@
"""
Color detection for OCR word boxes.
Detects the text color of existing OCR words and recovers colored text
regions (e.g. red markers, blue headings) that standard OCR may have missed.
Standard OCR (Tesseract, PaddleOCR) binarises images before processing,
destroying all color information. This module adds it back by sampling
HSV pixel values at word-box positions and finding colored regions that
no word-box covers.
Lizenz: Apache 2.0 (kommerziell nutzbar)
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# HSV color ranges (OpenCV: H 0-180, S 0-255, V 0-255)
# ---------------------------------------------------------------------------
# Each color maps to one or more (lower, upper) inclusive HSV bounds intended
# for cv2.inRange. Red needs TWO ranges because its hue wraps around both
# ends of the OpenCV hue circle (0-10 and 170-180).
# The S >= 70 and V >= 50 floors exclude washed-out (low-saturation) and
# near-black (low-value) pixels from every color mask.
_COLOR_RANGES: Dict[str, List[Tuple[np.ndarray, np.ndarray]]] = {
    "red": [
        (np.array([0, 70, 50]), np.array([10, 255, 255])),
        (np.array([170, 70, 50]), np.array([180, 255, 255])),
    ],
    "orange": [
        (np.array([10, 70, 50]), np.array([25, 255, 255])),
    ],
    "yellow": [
        (np.array([25, 70, 50]), np.array([35, 255, 255])),
    ],
    "green": [
        (np.array([35, 70, 50]), np.array([85, 255, 255])),
    ],
    "blue": [
        (np.array([100, 70, 50]), np.array([130, 255, 255])),
    ],
    "purple": [
        (np.array([130, 70, 50]), np.array([170, 255, 255])),
    ],
}
# Display hex value emitted for each color name (Tailwind-style palette).
# "black" and "gray" have no entry in _COLOR_RANGES: they are assigned by
# the saturation fallback in detect_word_colors, not by an HSV mask.
_COLOR_HEX: Dict[str, str] = {
    "black": "#000000",
    "gray": "#6b7280",
    "red": "#dc2626",
    "orange": "#ea580c",
    "yellow": "#ca8a04",
    "green": "#16a34a",
    "blue": "#2563eb",
    "purple": "#9333ea",
}
def _hue_to_color_name(hue: float) -> str:
"""Map OpenCV hue (0-180) to a color name."""
if hue < 10 or hue > 170:
return "red"
if hue < 25:
return "orange"
if hue < 35:
return "yellow"
if hue < 85:
return "green"
if hue < 130:
return "blue"
return "purple"
# ---------------------------------------------------------------------------
# 1. Color annotation for existing word boxes
# ---------------------------------------------------------------------------
def detect_word_colors(
    img_bgr: np.ndarray,
    word_boxes: List[Dict],
    sat_threshold: int = 50,
) -> None:
    """Annotate each word_box in-place with its detected text color.

    Adds ``color`` (hex string) and ``color_name`` (e.g. 'red', 'black')
    keys to each dict.

    Algorithm per word:
      1. Crop the word region from the image.
      2. Build a text-pixel mask (dark pixels OR high-saturation pixels).
      3. Sample HSV values at mask positions.
      4. If mean saturation >= threshold -> classify hue; else -> black.

    Parameters
    ----------
    img_bgr: BGR page image (e.g. from cv2.imdecode); may be None.
    word_boxes: dicts with at least left/top/width/height; mutated in place.
    sat_threshold: minimum mean saturation (0-255) to call a word colored.
    """
    if img_bgr is None or not word_boxes:
        return
    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    img_h, img_w = img_bgr.shape[:2]
    colored_count = 0
    for wb in word_boxes:
        # Clamp the box to image bounds; OCR boxes occasionally overshoot.
        x1 = max(0, int(wb["left"]))
        y1 = max(0, int(wb["top"]))
        x2 = min(img_w, int(wb["left"] + wb["width"]))
        y2 = min(img_h, int(wb["top"] + wb["height"]))
        if x2 <= x1 or y2 <= y1:
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
            continue
        crop_hsv = img_hsv[y1:y2, x1:x2]
        crop_gray = cv2.cvtColor(img_bgr[y1:y2, x1:x2], cv2.COLOR_BGR2GRAY)
        # Text pixels: dark in grayscale OR saturated (colored ink)
        _, dark_mask = cv2.threshold(crop_gray, 180, 255, cv2.THRESH_BINARY_INV)
        sat_mask = (crop_hsv[:, :, 1] > sat_threshold).astype(np.uint8) * 255
        text_mask = cv2.bitwise_or(dark_mask, sat_mask)
        text_pixels = crop_hsv[text_mask > 0]
        if len(text_pixels) < 3:
            # Too few samples to classify reliably.
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
            continue
        mean_sat = float(np.mean(text_pixels[:, 1]))
        if mean_sat < sat_threshold:
            wb["color"] = _COLOR_HEX["black"]
            wb["color_name"] = "black"
        else:
            # BUGFIX: hue is circular — red text has pixels near BOTH hue 0
            # and hue 180, so a plain arithmetic mean (e.g. of 2 and 178)
            # lands mid-range (~90) and misclassifies red as green/blue.
            # Use the circular (vector) mean instead: map hue 0-180 onto the
            # full angle circle 0-2*pi, average the unit vectors, map back.
            ang = text_pixels[:, 0].astype(np.float64) * (np.pi / 90.0)
            mean_ang = np.arctan2(np.mean(np.sin(ang)), np.mean(np.cos(ang)))
            mean_hue = float((mean_ang * 90.0 / np.pi) % 180.0)
            name = _hue_to_color_name(mean_hue)
            wb["color"] = _COLOR_HEX.get(name, _COLOR_HEX["black"])
            wb["color_name"] = name
            colored_count += 1
    if colored_count:
        logger.info("color annotation: %d / %d words are colored",
                    colored_count, len(word_boxes))
# ---------------------------------------------------------------------------
# 2. Recover colored text that OCR missed
# ---------------------------------------------------------------------------
def recover_colored_text(
    img_bgr: np.ndarray,
    existing_words: List[Dict],
    min_area: int = 40,
    max_regions: int = 60,
) -> List[Dict]:
    """Locate colored text regions that no existing OCR word box covers.

    Returns a list of recovered word dicts carrying ``color``,
    ``color_name`` and ``recovered=True``. The ``text`` value comes from a
    lightweight shape heuristic (e.g. ``!`` for tall narrow blobs) or ``?``.
    """
    if img_bgr is None:
        return []
    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    ih, iw = img_bgr.shape[:2]
    # Regions above 0.5% of the page area are treated as graphics, not text.
    max_area = int(ih * iw * 0.005)

    # Occupancy mask: every existing word box, padded by 4px on all sides,
    # so we only recover pixels OCR has not already claimed.
    occupied = np.zeros((ih, iw), dtype=np.uint8)
    pad = 4
    for wb in existing_words:
        top, left = int(wb["top"]), int(wb["left"])
        bottom = int(wb["top"] + wb["height"])
        right = int(wb["left"] + wb["width"])
        occupied[max(0, top - pad):min(ih, bottom + pad),
                 max(0, left - pad):min(iw, right + pad)] = 255

    recovered: List[Dict] = []
    for color_name, ranges in _COLOR_RANGES.items():
        # Union of all HSV sub-ranges belonging to this color.
        mask = np.zeros((ih, iw), dtype=np.uint8)
        for lower, upper in ranges:
            mask = cv2.bitwise_or(mask, cv2.inRange(img_hsv, lower, upper))
        # Drop pixels already covered by existing OCR words.
        mask = cv2.bitwise_and(mask, cv2.bitwise_not(occupied))
        # Morphological cleanup: a tall CLOSE merges an exclamation mark's
        # stroke with its dot; a small OPEN removes noise specks.
        mask = cv2.morphologyEx(
            mask, cv2.MORPH_CLOSE,
            cv2.getStructuringElement(cv2.MORPH_RECT, (3, 8)))
        mask = cv2.morphologyEx(
            mask, cv2.MORPH_OPEN,
            cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)))
        contours, _ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
        )
        candidates = []
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if not (min_area <= area <= max_area):
                continue
            bx, by, bw, bh = cv2.boundingRect(cnt)
            # Blobs shorter than 6px cannot plausibly be text.
            if bh >= 6:
                candidates.append((area, bx, by, bw, bh))
        # Largest regions first, capped at max_regions per color.
        ordered = sorted(candidates, key=lambda c: c[0], reverse=True)
        for area, bx, by, bw, bh in ordered[:max_regions]:
            recovered.append({
                "text": _identify_shape(bw, bh),
                "left": bx,
                "top": by,
                "width": bw,
                "height": bh,
                "conf": 45,
                "color": _COLOR_HEX.get(color_name, "#000000"),
                "color_name": color_name,
                "recovered": True,
            })
    if recovered:
        logger.info(
            "color recovery: %d colored regions found (%s)",
            len(recovered),
            ", ".join(
                f"{c}: {sum(1 for r in recovered if r['color_name'] == c)}"
                for c in sorted({r["color_name"] for r in recovered})
            ),
        )
    return recovered
def _identify_shape(w: int, h: int) -> str:
"""Simple shape heuristic for common single-character text markers."""
aspect = w / h if h > 0 else 1.0
if aspect < 0.55 and h > 10:
# Tall, narrow — likely exclamation mark
return "!"
if 0.6 < aspect < 1.5 and max(w, h) < 25:
# Small, roughly square — bullet or dot
return ""
return "?"

View File

@@ -20,6 +20,7 @@ import numpy as np
from fastapi import APIRouter, HTTPException, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_color_detect import detect_word_colors, recover_colored_text
from cv_words_first import _cluster_rows, _build_cells
from ocr_pipeline_session_store import (
get_session_db,
@@ -438,15 +439,30 @@ async def build_grid(session_id: str):
zones_data: List[Dict[str, Any]] = []
boxes_detected = 0
recovered_count = 0
img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
if img_png:
# Decode image for box detection
# Decode image for color detection + box detection
arr = np.frombuffer(img_png, dtype=np.uint8)
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is not None:
# --- Color detection: annotate existing words ---
detect_word_colors(img_bgr, all_words)
# --- Recover colored text that OCR missed ---
recovered = recover_colored_text(img_bgr, all_words)
if recovered:
recovered_count = len(recovered)
all_words.extend(recovered)
logger.info(
"build-grid session %s: +%d recovered colored words",
session_id, recovered_count,
)
# Detect bordered boxes
boxes = detect_boxes(
img_bgr,
@@ -529,6 +545,14 @@ async def build_grid(session_id: str):
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
# Collect color statistics from all word_boxes in cells
color_stats: Dict[str, int] = {}
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
cn = wb.get("color_name", "black")
color_stats[cn] = color_stats.get(cn, 0) + 1
result = {
"session_id": session_id,
"image_width": img_w,
@@ -541,6 +565,8 @@ async def build_grid(session_id: str):
"total_rows": total_rows,
"total_cells": total_cells,
"total_words": len(all_words),
"recovered_colored": recovered_count,
"color_stats": color_stats,
},
"formatting": {
"bold_columns": [],