Fix 3 grid issues: artifact cells, connector col noise, footer false positive
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 34s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m9s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 18s

1. Add per-cell artifact filter (4b2): removes single-word cells with
   ≤2 chars and confidence <65 (e.g. "as" from stray OCR marks)

2. Add narrow connector column normalization (4d2): when ≥60% of cells
   in a column share the same short text (e.g. "oder"), normalize
   near-match outliers like "oderb" → "oder"

3. Fix footer detection: require short text (≤20 chars) and no commas.
   Comma-separated lists like "Uhrzeit, Vergangenheit, Zukunft" are
   content continuations, not page numbers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-26 08:18:55 +01:00
parent 525de55791
commit 5af5d821a5

View File

@@ -631,6 +631,34 @@ async def _build_grid_core(
sorted(junk_row_indices),
)
# 4b2. Drop individual artifact cells: a cell whose only content is one
# very short, low-confidence word (stray OCR marks such as "as" or "b").
# Row-level junk removal keeps such cells alive whenever the same row
# holds valid cells in other columns, so they must be filtered per cell.
_ARTIFACT_MAX_LEN = 2
_ARTIFACT_CONF_THRESHOLD = 65
for z in zones_data:
    zone_cells = z.get("cells", [])
    if not zone_cells:
        continue
    # Collect ids of cells made up of exactly one tiny, low-confidence word.
    artifact_ids = set()
    for cell in zone_cells:
        word_boxes = cell.get("word_boxes") or []
        if len(word_boxes) != 1:
            continue
        only_word = word_boxes[0]
        word_text = (only_word.get("text") or "").strip()
        # Missing confidence defaults to 100, i.e. "trusted, keep".
        word_conf = only_word.get("conf", 100)
        if len(word_text) <= _ARTIFACT_MAX_LEN and word_conf < _ARTIFACT_CONF_THRESHOLD:
            artifact_ids.add(cell.get("cell_id"))
    if artifact_ids:
        z["cells"] = [c for c in zone_cells if c.get("cell_id") not in artifact_ids]
        # zone_cells still references the pre-filter list, so the removed
        # texts can be reported here.
        logger.info(
            "build-grid: removed %d artifact cells from zone %d: %s",
            len(artifact_ids), z.get("zone_index", 0),
            [c.get("text") for c in zone_cells if c.get("cell_id") in artifact_ids],
        )
# 4c. Remove oversized word_boxes from individual cells.
# OCR artifacts from graphics/images (e.g. a huge "N" from a map image)
# have word heights 3-5x the median. Remove them per-word so they don't
@@ -707,6 +735,52 @@ async def _build_grid_core(
if cleaned != text.strip():
cell["text"] = cleaned
# 4d2. Normalize narrow connector columns.
# Synonym dictionaries often repeat one short connector word (e.g.
# "oder") down a narrow column; OCR occasionally appends stray chars
# ("oderb"). When one short text dominates ≥60% of a column's cells,
# snap near-match outliers back to that dominant text.
for zone in zones_data:
    zone_cols = zone.get("columns", [])
    zone_cells = zone.get("cells", [])
    if not zone_cols or not zone_cells:
        continue
    for column in zone_cols:
        col_idx = column.get("index")
        column_cells = [c for c in zone_cells if c.get("col_index") == col_idx]
        if len(column_cells) < 3:
            continue
        # Tally how often each non-empty stripped text occurs in the column.
        occurrences: Dict[str, int] = {}
        for c in column_cells:
            stripped = (c.get("text") or "").strip()
            if stripped:
                occurrences[stripped] = occurrences.get(stripped, 0) + 1
        if not occurrences:
            continue
        # First maximal entry in insertion order wins, matching dict-key max.
        dominant_text, dominant_count = max(
            occurrences.items(), key=lambda kv: kv[1]
        )
        # Only a short word that covers ≥60% of the cells counts as a connector.
        if len(dominant_text) > 10 or dominant_count < len(column_cells) * 0.6:
            continue
        # Snap outliers that merely extend the dominant text by ≤2 chars.
        fixed = 0
        for c in column_cells:
            stripped = (c.get("text") or "").strip()
            is_near_match = (
                stripped != dominant_text
                and stripped.startswith(dominant_text)
                and len(stripped) <= len(dominant_text) + 2
            )
            if is_near_match:
                c["text"] = dominant_text
                # Keep the single word_box (if any) in sync with the cell text.
                boxes = c.get("word_boxes") or []
                if len(boxes) == 1:
                    boxes[0]["text"] = dominant_text
                fixed += 1
        if fixed:
            logger.info(
                "build-grid: normalized %d outlier cells in connector column %d "
                "(dominant='%s') zone %d",
                fixed, col_idx, dominant_text, zone.get("zone_index", 0),
            )
# 4e. Detect and remove page-border decoration strips.
# Skipped when the pre-filter already removed border words BEFORE
# column detection — re-running would incorrectly detect the
@@ -1095,8 +1169,9 @@ async def _build_grid_core(
if c.get("cell_id") not in page_ref_cell_ids]
# Detect footer: last non-header row if it has only 1 cell
# and the text is NOT IPA (no real IPA Unicode symbols),
# with short, non-content text (page numbers like "233" or
# "two hundred and twelve"). Comma-separated lists and long
# text are content continuations, not page numbers.
footer_rows = []
non_header_rows = [r for r in rows if not r.get("is_header")]
if non_header_rows:
@@ -1108,7 +1183,13 @@ async def _build_grid_core(
text = (last_cells[0].get("text") or "").strip()
# Not IPA (no real IPA symbols) and not a heading
has_real_ipa = any(c in _REAL_IPA_CHARS_SET for c in text)
if text and not has_real_ipa and last_cells[0].get("col_type") != "heading":
# Comma-separated text is a content continuation, not a footer
has_commas = ',' in text
# Long text (>20 chars) is unlikely a page number
is_short = len(text) <= 20
if (text and not has_real_ipa and not has_commas
and is_short
and last_cells[0].get("col_type") != "heading"):
footer_rows.append({
"row_index": last_ri,
"text": text,