revert: remove marker column OCR special handling
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 25s
CI / test-go-edu-search (push) Successful in 25s
CI / test-python-klausur (push) Failing after 1m48s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 18s
The HSV-based coloured marker detection caused false positives in nearly every marker cell. Coloured markers like red "!" are an extreme edge case — better handled manually in reconstruction.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4621,39 +4621,10 @@ def _ocr_single_cell(
|
|||||||
# in an otherwise empty cell.
|
# in an otherwise empty cell.
|
||||||
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
dark_ratio = float(np.count_nonzero(crop < 180)) / crop.size
|
||||||
_run_fallback = dark_ratio > 0.005
|
_run_fallback = dark_ratio > 0.005
|
||||||
# For marker columns, also check the colour image — red/coloured
|
|
||||||
# markers appear near-white in grayscale but have high saturation.
|
|
||||||
if not _run_fallback and img_bgr is not None and col.type == 'column_marker':
|
|
||||||
bgr_crop = img_bgr[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
|
||||||
if bgr_crop.size > 0:
|
|
||||||
hsv = cv2.cvtColor(bgr_crop, cv2.COLOR_BGR2HSV)
|
|
||||||
# Saturation > 80 indicates coloured (non-grey) pixels
|
|
||||||
sat_ratio = float(np.count_nonzero(hsv[:, :, 1] > 80)) / (hsv.shape[0] * hsv.shape[1])
|
|
||||||
if sat_ratio > 0.005:
|
|
||||||
_run_fallback = True
|
|
||||||
if _run_fallback:
|
if _run_fallback:
|
||||||
# For marker columns with coloured content (e.g. red "!"), convert
|
|
||||||
# the BGR crop to a binarized grayscale that preserves saturated pixels.
|
|
||||||
_marker_ocr_img = ocr_img
|
|
||||||
if col.type == 'column_marker' and img_bgr is not None:
|
|
||||||
bgr_crop_full = img_bgr[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
|
||||||
if bgr_crop_full.size > 0:
|
|
||||||
hsv = cv2.cvtColor(bgr_crop_full, cv2.COLOR_BGR2HSV)
|
|
||||||
# Create mask: either dark (V<180) or saturated (S>60)
|
|
||||||
dark_mask = hsv[:, :, 2] < 180
|
|
||||||
sat_mask = hsv[:, :, 1] > 60
|
|
||||||
combined = dark_mask | sat_mask
|
|
||||||
# Build grayscale: foreground=0 (black), background=255 (white)
|
|
||||||
marker_gray = np.full(combined.shape, 255, dtype=np.uint8)
|
|
||||||
marker_gray[combined] = 0
|
|
||||||
# Place into a full-size image at the crop position
|
|
||||||
_marker_full = np.full_like(ocr_img, 255)
|
|
||||||
_marker_full[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w] = marker_gray
|
|
||||||
_marker_ocr_img = _marker_full
|
|
||||||
|
|
||||||
# For narrow columns, upscale the crop before OCR
|
# For narrow columns, upscale the crop before OCR
|
||||||
if is_narrow and _marker_ocr_img is not None:
|
if is_narrow and ocr_img is not None:
|
||||||
_crop_slice = _marker_ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
_crop_slice = ocr_img[cell_y:cell_y + cell_h, cell_x:cell_x + cell_w]
|
||||||
_upscaled = _ensure_minimum_crop_size(_crop_slice)
|
_upscaled = _ensure_minimum_crop_size(_crop_slice)
|
||||||
if _upscaled is not _crop_slice:
|
if _upscaled is not _crop_slice:
|
||||||
# Build a temporary full-size image with the upscaled crop
|
# Build a temporary full-size image with the upscaled crop
|
||||||
@@ -4682,7 +4653,7 @@ def _ocr_single_cell(
|
|||||||
)
|
)
|
||||||
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
|
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
fallback_words = ocr_region(_marker_ocr_img, cell_region,
|
fallback_words = ocr_region(ocr_img, cell_region,
|
||||||
lang=cell_lang, psm=_cell_psm)
|
lang=cell_lang, psm=_cell_psm)
|
||||||
else:
|
else:
|
||||||
cell_region = PageRegion(
|
cell_region = PageRegion(
|
||||||
@@ -4699,7 +4670,7 @@ def _ocr_single_cell(
|
|||||||
else:
|
else:
|
||||||
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
|
_cell_psm = _select_psm_for_column(col.type, col.width, row.height)
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
fallback_words = ocr_region(_marker_ocr_img, cell_region,
|
fallback_words = ocr_region(ocr_img, cell_region,
|
||||||
lang=cell_lang, psm=_cell_psm)
|
lang=cell_lang, psm=_cell_psm)
|
||||||
|
|
||||||
if fallback_words:
|
if fallback_words:
|
||||||
@@ -4723,7 +4694,7 @@ def _ocr_single_cell(
|
|||||||
width=cell_w, height=cell_h,
|
width=cell_w, height=cell_h,
|
||||||
)
|
)
|
||||||
cell_lang = lang_map.get(col.type, lang)
|
cell_lang = lang_map.get(col.type, lang)
|
||||||
psm7_words = ocr_region(_marker_ocr_img, _fb_region, lang=cell_lang, psm=7)
|
psm7_words = ocr_region(ocr_img, _fb_region, lang=cell_lang, psm=7)
|
||||||
if psm7_words:
|
if psm7_words:
|
||||||
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
psm7_words = [w for w in psm7_words if w.get('conf', 0) >= _MIN_WORD_CONF]
|
||||||
if psm7_words:
|
if psm7_words:
|
||||||
@@ -4768,9 +4739,7 @@ def _ocr_single_cell(
|
|||||||
used_engine = 'row_strip_rapid'
|
used_engine = 'row_strip_rapid'
|
||||||
|
|
||||||
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
# --- NOISE FILTER: clear cells that contain only OCR artifacts ---
|
||||||
# Skip noise filter for marker columns — they legitimately contain
|
if text.strip():
|
||||||
# only punctuation like "!" or "*" which _clean_cell_text would remove.
|
|
||||||
if text.strip() and col.type != 'column_marker':
|
|
||||||
text = _clean_cell_text(text)
|
text = _clean_cell_text(text)
|
||||||
if not text:
|
if not text:
|
||||||
avg_conf = 0.0
|
avg_conf = 0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user