fix: border strip pre-filter + 3-column detection for vocabulary tables

The border strip filter (Step 4e) used the LARGEST x-gap which incorrectly
removed base words along with edge artifacts. Now uses a two-stage approach:
1. _filter_border_strip_words() pre-filters raw words BEFORE column detection,
   scanning from the page edge inward to find the FIRST significant gap (>30px)
2. Step 4e runs as fallback only when pre-filter didn't apply

Session 4233 now correctly detects 3 columns (base word | oder | synonyms)
instead of 2. Threshold raised from 15% to 20% to handle pages with many
edge artifacts. All 4 ground-truth sessions pass regression testing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 21:01:43 +01:00
parent 4000110501
commit 46c8c28d34
2 changed files with 212 additions and 117 deletions

View File

@@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging
import re
import time
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
@@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
# Helpers
# ---------------------------------------------------------------------------
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
Scans from each page edge inward to find the first significant x-gap
(>30 px). If the edge cluster contains <15 % of total words, those
words are removed as border-strip artifacts (alphabet letters,
illustration fragments).
Must run BEFORE ``_build_zone_grid`` so that column detection only
sees real content words and doesn't produce inflated row counts.
"""
if len(words) < 10:
return words, 0
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
total = len(sorted_words)
# -- Left-edge scan (running max right-edge) --
left_count = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
)
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
left_count = gi + 1
break
# -- Right-edge scan (running min left) --
right_count = 0
running_left = sorted_words[-1].get("left", 0)
for gi in range(total - 1, 0, -1):
running_left = min(running_left, sorted_words[gi].get("left", 0))
prev_right = (
sorted_words[gi - 1].get("left", 0)
+ sorted_words[gi - 1].get("width", 0)
)
if running_left - prev_right > 30:
right_count = total - gi
break
strip_ids: set = set()
if left_count > 0 and left_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[:left_count]}
elif right_count > 0 and right_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[total - right_count :]}
if not strip_ids:
return words, 0
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
def _cluster_columns_by_alignment(
words: List[Dict],
zone_w: int,
@@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
zones_data: List[Dict[str, Any]] = []
boxes_detected = 0
recovered_count = 0
border_prefiltered = False
img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
@@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index,
)
zone_words, bs_removed = _filter_border_strip_words(zone_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
@@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed,
)
# Pre-filter border-strip words so column detection is not
# confused by edge artifacts. When this removes words, Step 4e
# is skipped (it would otherwise re-detect content as a "strip").
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid session %s: pre-filtered %d border-strip words",
session_id, bs_removed,
)
grid = _build_zone_grid(
filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h,
@@ -1895,64 +1967,93 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
cell["text"] = cleaned
# 4e. Detect and remove page-border decoration strips.
# Some textbooks have decorative alphabet strips along the page edge
# (coloured letters, illustrations). OCR picks up scattered letters
# from these as artifacts. Detection: find a significant x-gap
# (>30 px) between a small cluster of word_boxes near the page edge
# and the main content block.
# Skipped when the pre-filter already removed border words BEFORE
# column detection — re-running would incorrectly detect the
# leftmost content column as a "strip".
border_strip_removed = 0
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
# Collect all word_boxes with their cell reference
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
for cell in cells:
for wb in cell.get("word_boxes") or []:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
if len(all_wbs_with_cell) < 10:
continue
# Sort by x and find the largest gap
all_wbs_with_cell.sort(key=lambda t: t[0])
best_gap = 0
best_gap_idx = -1
for gi in range(len(all_wbs_with_cell) - 1):
right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0)
gap = all_wbs_with_cell[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_gap_idx = gi
if best_gap < 30 or best_gap_idx < 0:
continue
left_count = best_gap_idx + 1
right_count = len(all_wbs_with_cell) - left_count
total = len(all_wbs_with_cell)
# The border strip is the SMALLER side with < 15% of total
if left_count < right_count and left_count / total < 0.15:
strip_side = "left"
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
elif right_count < left_count and right_count / total < 0.15:
strip_side = "right"
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
else:
continue
# Remove strip word_boxes from cells
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
if len(filtered) < len(wbs):
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
# Remove cells that became empty
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
best_gap, left_count if strip_side == "left" else right_count, total,
)
if border_prefiltered:
logger.info("Step 4e: skipped (border pre-filter already applied)")
else:
# Some textbooks have decorative alphabet strips along the page
# edge. OCR picks up scattered letters from these as artifacts.
# Detection: find the first significant x-gap (>30 px) from each
# page edge between a small cluster (<20 %) and the main content.
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
for cell in cells:
for wb in cell.get("word_boxes") or []:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
if len(all_wbs_with_cell) < 10:
continue
all_wbs_with_cell.sort(key=lambda t: t[0])
total = len(all_wbs_with_cell)
# -- Left-edge scan --
left_strip_count = 0
left_gap = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
)
gap = all_wbs_with_cell[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
left_gap = gap
break
# -- Right-edge scan --
right_strip_count = 0
right_gap = 0
running_left = all_wbs_with_cell[-1][0]
for gi in range(total - 1, 0, -1):
running_left = min(running_left, all_wbs_with_cell[gi][0])
prev_right = (
all_wbs_with_cell[gi - 1][0]
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
)
gap = running_left - prev_right
if gap > 30:
right_strip_count = total - gi
right_gap = gap
break
strip_wbs: set = set()
strip_side = ""
strip_gap = 0
strip_count = 0
if left_strip_count > 0 and left_strip_count / total < 0.20:
strip_side = "left"
strip_count = left_strip_count
strip_gap = left_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
elif right_strip_count > 0 and right_strip_count / total < 0.20:
strip_side = "right"
strip_count = right_strip_count
strip_gap = right_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
if not strip_wbs:
continue
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
if len(filtered) < len(wbs):
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
strip_gap, strip_count, total,
)
# 5. Color annotation on final word_boxes in cells
if img_bgr is not None: