fix: hard-filter OCR words inside detected graphic regions
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 1m51s
CI / test-python-agent-core (push) Successful in 18s
CI / test-nodejs-website (push) Successful in 16s

Run detect_graphic_elements() in the grid pipeline after image loading
and remove ALL words whose centroids fall inside detected graphic regions,
regardless of confidence. Previously only low-confidence words (conf < 50)
were removed, letting artifacts like "Tr", "Su" survive.

Changes:
- grid_editor_api.py: Import and call detect_graphic_elements() at Step 3a,
  passing only significant words (len >= 3) to avoid short artifacts fooling
  the text-vs-graphic heuristic. Hard-filter all words in graphic regions.
- cv_graphic_detect.py: Lower the color-density threshold from 20% to 5% for
  large regions (>100x80 px) — photos/illustrations can have low color-pixel
  density because most of their pixels are grayscale ink.
  Raise the page-spanning size limit from 50% to 60% of page width/height.

Tested: 5 ground-truth sessions pass regression (079cd0d9, d8533a2c,
2838c7a7, 4233d7e3, 5997b635). Session 5997 now detects 2 graphic regions
and removes 29 artifact words including "Tr" and "Su".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-22 10:18:23 +01:00
parent 7b3319be2e
commit 4a44ad7986
2 changed files with 57 additions and 25 deletions

View File

@@ -170,7 +170,7 @@ def detect_graphic_elements(
continue
# Skip page-spanning regions
if bw > w * 0.5 or bh > h * 0.5:
if bw > w * 0.6 or bh > h * 0.6:
logger.debug("GraphicDetect PASS1 skip page-spanning (%d,%d) %dx%d", bx, by, bw, bh)
continue
@@ -232,12 +232,16 @@ def detect_graphic_elements(
if color_pixel_count < 200:
continue
# (d) Very low density → thin strokes, almost certainly text
if density < 0.20:
# (d) Very low density → thin strokes, almost certainly text.
# Large regions (photos/illustrations) can have low color density
# because most pixels are grayscale ink. Use a lower threshold
# for regions bigger than 100×80 px.
_min_density = 0.05 if (bw > 100 and bh > 80) else 0.20
if density < _min_density:
logger.info(
"GraphicDetect PASS1 skip low-density (%d,%d) %dx%d "
"density=%.0f%% (likely colored text)",
bx, by, bw, bh, density * 100,
"density=%.0f%% (min=%.0f%%, likely colored text)",
bx, by, bw, bh, density * 100, _min_density * 100,
)
continue

View File

@@ -21,6 +21,7 @@ import numpy as np
from fastapi import APIRouter, HTTPException, Request
from cv_box_detect import detect_boxes, split_page_into_zones
from cv_graphic_detect import detect_graphic_elements
from cv_vocab_types import PageZone
from cv_color_detect import detect_word_colors, recover_colored_text
from cv_ocr_engines import fix_cell_phonetics, fix_ipa_continuation_cell, _text_has_garbled_ipa, _lookup_ipa, _words_to_reading_order_text, _group_words_into_lines
@@ -1469,13 +1470,12 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
session_id, removed, len(exclude_rects),
)
# 2e. Filter words inside detected graphic/image regions
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
# High-confidence words are real text even if they overlap a detected
# graphic region (e.g. colored text that graphic detection couldn't
# fully distinguish from an image).
_GRAPHIC_CONF_THRESHOLD = 50 # keep words with conf >= 50
graphic_rects = []
# 2e. Hard-filter words inside graphic/image regions from structure step.
# ALL words inside graphic regions are removed regardless of confidence —
# images cannot contain real text; any OCR words inside are artifacts.
# After image loading (Step 3a) we augment these with freshly detected
# graphic regions from cv_graphic_detect.
graphic_rects: List[Dict[str, int]] = []
if structure_result:
for g in structure_result.get("graphics", []):
graphic_rects.append({
@@ -1484,23 +1484,18 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
})
if graphic_rects:
before = len(all_words)
filtered = []
for w in all_words:
w_cx = w["left"] + w.get("width", 0) / 2
w_cy = w["top"] + w.get("height", 0) / 2
inside = any(
gr["x"] <= w_cx <= gr["x"] + gr["w"]
and gr["y"] <= w_cy <= gr["y"] + gr["h"]
all_words = [
w for w in all_words
if not any(
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
for gr in graphic_rects
)
if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
continue # remove low-confidence artifact
filtered.append(w)
removed = before - len(filtered)
]
removed = before - len(all_words)
if removed:
all_words = filtered
logger.info(
"build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
"build-grid session %s: hard-removed %d words inside %d structure graphic region(s)",
session_id, removed, len(graphic_rects),
)
@@ -1525,6 +1520,39 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
img_bgr = cv2.imdecode(arr, cv2.IMREAD_COLOR)
if img_bgr is not None:
# --- 3a. Detect graphic/image regions via CV and hard-filter ---
# Pass only significant words (len >= 3) to the detector so that
# short OCR artifacts inside images don't fool the text-vs-graphic
# heuristic (it counts word centroids to distinguish text from images).
sig_words = [w for w in all_words if len((w.get("text") or "").strip()) >= 3]
fresh_graphics = detect_graphic_elements(img_bgr, sig_words)
if fresh_graphics:
fresh_rects = [
{"x": g.x, "y": g.y, "w": g.width, "h": g.height}
for g in fresh_graphics
]
graphic_rects.extend(fresh_rects)
logger.info(
"build-grid session %s: detected %d graphic region(s) via CV",
session_id, len(fresh_graphics),
)
# Hard-filter words inside newly detected graphic regions
before = len(all_words)
all_words = [
w for w in all_words
if not any(
gr["x"] <= w["left"] + w.get("width", 0) / 2 <= gr["x"] + gr["w"]
and gr["y"] <= w["top"] + w.get("height", 0) / 2 <= gr["y"] + gr["h"]
for gr in fresh_rects
)
]
removed = before - len(all_words)
if removed:
logger.info(
"build-grid session %s: hard-removed %d words inside %d fresh graphic region(s)",
session_id, removed, len(fresh_rects),
)
# --- Recover colored text that OCR missed (before grid building) ---
recovered = recover_colored_text(img_bgr, all_words)
if recovered and graphic_rects: