fix: conservative column detection + smart graphic word filter

Column detection:
- Raise MIN_COVERAGE_PRIMARY 20%→35% (prevents false columns in flowing
  text, where random inter-word gaps cover < 35% of rows)
- Raise MIN_COVERAGE_SECONDARY 12%→20%, MIN_WORDS_SECONDARY 3→4,
  MIN_DISTINCT_ROWS 2→3
- Vocabulary worksheets unaffected (columns appear in >80% of rows)

Graphic word filter:
- Only remove words with OCR confidence < 50 inside graphic regions
- High-confidence words are real text, not image artifacts
- Prevents legitimate colored text from being discarded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 18:19:25 +01:00
parent a079ffe8e9
commit 19b93f7762
1 changed file with 16 additions and 7 deletions
@@ -147,10 +147,13 @@ def _cluster_columns_by_alignment(
    })

    # --- Filter by row coverage ---
-    MIN_COVERAGE_PRIMARY = 0.20
-    MIN_COVERAGE_SECONDARY = 0.12
-    MIN_WORDS_SECONDARY = 3
-    MIN_DISTINCT_ROWS = 2
+    # These thresholds must be high enough to avoid false columns in flowing
+    # text (random inter-word gaps) while still detecting real columns in
+    # vocabulary worksheets (which typically have >80% row coverage).
+    MIN_COVERAGE_PRIMARY = 0.35
+    MIN_COVERAGE_SECONDARY = 0.20
+    MIN_WORDS_SECONDARY = 4
+    MIN_DISTINCT_ROWS = 3

    # Content boundary for left-margin detection
    content_x_min = min(w["left"] for w in words)
@@ -694,6 +697,11 @@ async def build_grid(session_id: str):
                session_id, len(all_words), len(word_result["cells"]))

    # 2b. Filter words inside detected graphic/image regions
+    # Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
+    # High-confidence words are real text even if they overlap a detected
+    # graphic region (e.g. colored text that graphic detection couldn't
+    # fully distinguish from an image).
+    _GRAPHIC_CONF_THRESHOLD = 50  # keep words with conf >= 50
    structure_result = session.get("structure_result")
    graphic_rects = []
    if structure_result:
@@ -713,13 +721,14 @@ async def build_grid(session_id: str):
                and gr["y"] <= w_cy <= gr["y"] + gr["h"]
                for gr in graphic_rects
            )
-            if not inside:
+            if inside and w.get("conf", 0) < _GRAPHIC_CONF_THRESHOLD:
+                continue  # remove low-confidence artifact
            filtered.append(w)
        removed = before - len(filtered)
        if removed:
            all_words = filtered
            logger.info(
-                "build-grid session %s: removed %d words inside %d graphic region(s)",
+                "build-grid session %s: removed %d low-conf words inside %d graphic region(s)",
                session_id, removed, len(graphic_rects),
            )