fix: conservative column detection + smart graphic word filter
Column detection:
- Raise MIN_COVERAGE_PRIMARY 20%→35% (prevents false columns in flowing text, where random inter-word gaps line up in fewer than 35% of rows)
- Raise MIN_COVERAGE_SECONDARY 12%→20%, MIN_WORDS_SECONDARY 3→4, MIN_DISTINCT_ROWS 2→3
- Vocabulary worksheets are unaffected (their columns appear in >80% of rows)

Graphic word filter:
- Only remove words with OCR confidence < 50 inside graphic regions
- High-confidence words are real text, not image artifacts
- Prevents legitimate colored text from being discarded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -147,10 +147,13 @@ def _cluster_columns_by_alignment(
|
|||||||
})
|
})
|
||||||
|
|
||||||
# --- Filter by row coverage ---
|
# --- Filter by row coverage ---
|
||||||
MIN_COVERAGE_PRIMARY = 0.20
|
# These thresholds must be high enough to avoid false columns in flowing
|
||||||
MIN_COVERAGE_SECONDARY = 0.12
|
# text (random inter-word gaps) while still detecting real columns in
|
||||||
MIN_WORDS_SECONDARY = 3
|
# vocabulary worksheets (which typically have >80% row coverage).
|
||||||
MIN_DISTINCT_ROWS = 2
|
MIN_COVERAGE_PRIMARY = 0.35
|
||||||
|
MIN_COVERAGE_SECONDARY = 0.20
|
||||||
|
MIN_WORDS_SECONDARY = 4
|
||||||
|
MIN_DISTINCT_ROWS = 3
|
||||||
|
|
||||||
# Content boundary for left-margin detection
|
# Content boundary for left-margin detection
|
||||||
content_x_min = min(w["left"] for w in words)
|
content_x_min = min(w["left"] for w in words)
|
||||||
@@ -694,6 +697,11 @@ async def build_grid(session_id: str):
|
|||||||
session_id, len(all_words), len(word_result["cells"]))
|
session_id, len(all_words), len(word_result["cells"]))
|
||||||
|
|
||||||
# 2b. Filter words inside detected graphic/image regions
|
# 2b. Filter words inside detected graphic/image regions
|
||||||
|
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
|
||||||
|
# High-confidence words are real text even if they overlap a detected
|
||||||
|
# graphic region (e.g. colored text that graphic detection couldn't
|
||||||
|
# fully distinguish from an image).
|
||||||
|
_GRAPHIC_CONF_THRESHOLD = 50 # keep words with conf >= 50
|
||||||
structure_result = session.get("structure_result")
|
structure_result = session.get("structure_result")
|
||||||
graphic_rects = []
|
graphic_rects = []
|
||||||
if structure_result:
|
if structure_result:
|
||||||
@@ -713,13 +721,14 @@ async def build_grid(session_id: str):
|
|||||||
and gr["y"] <= w_cy <= gr["y"] + gr["h"]
|
and gr["y"] <= w_cy <= gr["y"] + gr["h"]
|
||||||
for gr in graphic_rects
|
for gr in graphic_rects
|
||||||
)
|
)
|
||||||
if not inside:
|
if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
|
||||||
|
continue # remove low-confidence artifact
|
||||||
filtered.append(w)
|
filtered.append(w)
|
||||||
removed = before - len(filtered)
|
removed = before - len(filtered)
|
||||||
if removed:
|
if removed:
|
||||||
all_words = filtered
|
all_words = filtered
|
||||||
logger.info(
|
logger.info(
|
||||||
"build-grid session %s: removed %d words inside %d graphic region(s)",
|
"build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
|
||||||
session_id, removed, len(graphic_rects),
|
session_id, removed, len(graphic_rects),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user