feat: add color detection for OCR word boxes
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled

New cv_color_detect.py module:
- detect_word_colors(): annotates existing words with text color (HSV analysis)
- recover_colored_text(): finds colored text regions missed by standard OCR
  (e.g. red "!" markers) using HSV masks + contour detection

Integrated into build-grid: words get color/color_name fields, and recovered
colored regions are merged into the word list before grid building.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-15 00:50:09 +01:00
parent 39a4d8564c
commit 2bd63ec402
2 changed files with 280 additions and 1 deletions

View File

@@ -20,6 +20,7 @@ import numpy as np
from fastapi import APIRouter, HTTPException, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_color_detect import detect_word_colors, recover_colored_text
from cv_words_first import _cluster_rows, _build_cells
from ocr_pipeline_session_store import (
get_session_db,
@@ -438,15 +439,30 @@ async def build_grid(session_id: str):
zones_data: List[Dict[str, Any]] = []
boxes_detected = 0
recovered_count = 0
img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
if img_png:
# Decode image for box detection
# Decode image for color detection + box detection
arr = np.frombuffer(img_png, dtype=np.uint8)
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is not None:
# --- Color detection: annotate existing words ---
detect_word_colors(img_bgr, all_words)
# --- Recover colored text that OCR missed ---
recovered = recover_colored_text(img_bgr, all_words)
if recovered:
recovered_count = len(recovered)
all_words.extend(recovered)
logger.info(
"build-grid session %s: +%d recovered colored words",
session_id, recovered_count,
)
# Detect bordered boxes
boxes = detect_boxes(
img_bgr,
@@ -529,6 +545,14 @@ async def build_grid(session_id: str):
total_columns = sum(len(z.get("columns", [])) for z in zones_data)
total_rows = sum(len(z.get("rows", [])) for z in zones_data)
# Collect color statistics from all word_boxes in cells
color_stats: Dict[str, int] = {}
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
cn = wb.get("color_name", "black")
color_stats[cn] = color_stats.get(cn, 0) + 1
result = {
"session_id": session_id,
"image_width": img_w,
@@ -541,6 +565,8 @@ async def build_grid(session_id: str):
"total_rows": total_rows,
"total_cells": total_cells,
"total_words": len(all_words),
"recovered_colored": recovered_count,
"color_stats": color_stats,
},
"formatting": {
"bold_columns": [],