fix: conservative column detection + smart graphic word filter
Column detection:
- Raise MIN_COVERAGE_PRIMARY 20%→35% (prevents false columns in flowing text, where random inter-word gaps line up in fewer than 35% of rows)
- Raise MIN_COVERAGE_SECONDARY 12%→20%, MIN_WORDS_SECONDARY 3→4, MIN_DISTINCT_ROWS 2→3
- Vocabulary worksheets unaffected (columns appear in >80% of rows)

Graphic word filter:
- Only remove words with OCR confidence < 50 inside graphic regions
- High-confidence words are real text, not image artifacts
- Prevents legitimate colored text from being discarded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -147,10 +147,13 @@ def _cluster_columns_by_alignment(
|
||||
})
|
||||
|
||||
# --- Filter by row coverage ---
|
||||
MIN_COVERAGE_PRIMARY = 0.20
|
||||
MIN_COVERAGE_SECONDARY = 0.12
|
||||
MIN_WORDS_SECONDARY = 3
|
||||
MIN_DISTINCT_ROWS = 2
|
||||
# These thresholds must be high enough to avoid false columns in flowing
|
||||
# text (random inter-word gaps) while still detecting real columns in
|
||||
# vocabulary worksheets (which typically have >80% row coverage).
|
||||
MIN_COVERAGE_PRIMARY = 0.35
|
||||
MIN_COVERAGE_SECONDARY = 0.20
|
||||
MIN_WORDS_SECONDARY = 4
|
||||
MIN_DISTINCT_ROWS = 3
|
||||
|
||||
# Content boundary for left-margin detection
|
||||
content_x_min = min(w["left"] for w in words)
|
||||
@@ -694,6 +697,11 @@ async def build_grid(session_id: str):
|
||||
session_id, len(all_words), len(word_result["cells"]))
|
||||
|
||||
# 2b. Filter words inside detected graphic/image regions
|
||||
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
|
||||
# High-confidence words are real text even if they overlap a detected
|
||||
# graphic region (e.g. colored text that graphic detection couldn't
|
||||
# fully distinguish from an image).
|
||||
_GRAPHIC_CONF_THRESHOLD = 50 # keep words with conf >= 50
|
||||
structure_result = session.get("structure_result")
|
||||
graphic_rects = []
|
||||
if structure_result:
|
||||
@@ -713,13 +721,14 @@ async def build_grid(session_id: str):
|
||||
and gr["y"] <= w_cy <= gr["y"] + gr["h"]
|
||||
for gr in graphic_rects
|
||||
)
|
||||
if not inside:
|
||||
filtered.append(w)
|
||||
if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
|
||||
continue # remove low-confidence artifact
|
||||
filtered.append(w)
|
||||
removed = before - len(filtered)
|
||||
if removed:
|
||||
all_words = filtered
|
||||
logger.info(
|
||||
"build-grid session %s: removed %d words inside %d graphic region(s)",
|
||||
"build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
|
||||
session_id, removed, len(graphic_rects),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user