fix: border strip pre-filter + 3-column detection for vocabulary tables
The border strip filter (Step 4e) used the LARGEST x-gap, which incorrectly removed base words along with edge artifacts. It now uses a two-stage approach:
1. _filter_border_strip_words() pre-filters raw words BEFORE column detection, scanning from each page edge inward to find the FIRST significant gap (>30px).
2. Step 4e runs as a fallback only when the pre-filter didn't apply.
Session 4233 now correctly detects 3 columns (base word | oder | synonyms) instead of 2. The strip-size threshold was raised from 15% to 20% to handle pages with many edge artifacts. All 4 ground-truth sessions pass regression. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
@@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
|
||||
"""Remove page-border decoration strip words BEFORE column detection.
|
||||
|
||||
Scans from each page edge inward to find the first significant x-gap
|
||||
(>30 px). If the edge cluster contains <15 % of total words, those
|
||||
words are removed as border-strip artifacts (alphabet letters,
|
||||
illustration fragments).
|
||||
|
||||
Must run BEFORE ``_build_zone_grid`` so that column detection only
|
||||
sees real content words and doesn't produce inflated row counts.
|
||||
"""
|
||||
if len(words) < 10:
|
||||
return words, 0
|
||||
|
||||
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
|
||||
total = len(sorted_words)
|
||||
|
||||
# -- Left-edge scan (running max right-edge) --
|
||||
left_count = 0
|
||||
running_right = 0
|
||||
for gi in range(total - 1):
|
||||
running_right = max(
|
||||
running_right,
|
||||
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
|
||||
)
|
||||
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
|
||||
left_count = gi + 1
|
||||
break
|
||||
|
||||
# -- Right-edge scan (running min left) --
|
||||
right_count = 0
|
||||
running_left = sorted_words[-1].get("left", 0)
|
||||
for gi in range(total - 1, 0, -1):
|
||||
running_left = min(running_left, sorted_words[gi].get("left", 0))
|
||||
prev_right = (
|
||||
sorted_words[gi - 1].get("left", 0)
|
||||
+ sorted_words[gi - 1].get("width", 0)
|
||||
)
|
||||
if running_left - prev_right > 30:
|
||||
right_count = total - gi
|
||||
break
|
||||
|
||||
strip_ids: set = set()
|
||||
if left_count > 0 and left_count / total < 0.20:
|
||||
strip_ids = {id(w) for w in sorted_words[:left_count]}
|
||||
elif right_count > 0 and right_count / total < 0.20:
|
||||
strip_ids = {id(w) for w in sorted_words[total - right_count :]}
|
||||
|
||||
if not strip_ids:
|
||||
return words, 0
|
||||
|
||||
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
|
||||
|
||||
|
||||
def _cluster_columns_by_alignment(
|
||||
words: List[Dict],
|
||||
zone_w: int,
|
||||
@@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
zones_data: List[Dict[str, Any]] = []
|
||||
boxes_detected = 0
|
||||
recovered_count = 0
|
||||
border_prefiltered = False
|
||||
img_bgr = None
|
||||
|
||||
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
|
||||
@@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
"build-grid: filtered %d words inside image overlays from zone %d",
|
||||
ov_removed, pz.index,
|
||||
)
|
||||
zone_words, bs_removed = _filter_border_strip_words(zone_words)
|
||||
if bs_removed:
|
||||
border_prefiltered = True
|
||||
logger.info(
|
||||
"build-grid: pre-filtered %d border-strip words from zone %d",
|
||||
bs_removed, pz.index,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
zone_words, pz.x, pz.y, pz.width, pz.height,
|
||||
pz.index, img_w, img_h,
|
||||
@@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
|
||||
session_id, removed,
|
||||
)
|
||||
# Pre-filter border-strip words so column detection is not
|
||||
# confused by edge artifacts. When this removes words, Step 4e
|
||||
# is skipped (it would otherwise re-detect content as a "strip").
|
||||
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
|
||||
if bs_removed:
|
||||
border_prefiltered = True
|
||||
logger.info(
|
||||
"build-grid session %s: pre-filtered %d border-strip words",
|
||||
session_id, bs_removed,
|
||||
)
|
||||
grid = _build_zone_grid(
|
||||
filtered_words, content_x, content_y, content_w, content_h,
|
||||
0, img_w, img_h,
|
||||
@@ -1895,64 +1967,93 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
cell["text"] = cleaned
|
||||
|
||||
# 4e. Detect and remove page-border decoration strips.
|
||||
# Some textbooks have decorative alphabet strips along the page edge
|
||||
# (coloured letters, illustrations). OCR picks up scattered letters
|
||||
# from these as artifacts. Detection: find a significant x-gap
|
||||
# (>30 px) between a small cluster of word_boxes near the page edge
|
||||
# and the main content block.
|
||||
# Skipped when the pre-filter already removed border words BEFORE
|
||||
# column detection — re-running would incorrectly detect the
|
||||
# leftmost content column as a "strip".
|
||||
border_strip_removed = 0
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
# Collect all word_boxes with their cell reference
|
||||
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
|
||||
for cell in cells:
|
||||
for wb in cell.get("word_boxes") or []:
|
||||
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
||||
if len(all_wbs_with_cell) < 10:
|
||||
continue
|
||||
# Sort by x and find the largest gap
|
||||
all_wbs_with_cell.sort(key=lambda t: t[0])
|
||||
best_gap = 0
|
||||
best_gap_idx = -1
|
||||
for gi in range(len(all_wbs_with_cell) - 1):
|
||||
right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0)
|
||||
gap = all_wbs_with_cell[gi + 1][0] - right_edge
|
||||
if gap > best_gap:
|
||||
best_gap = gap
|
||||
best_gap_idx = gi
|
||||
if best_gap < 30 or best_gap_idx < 0:
|
||||
continue
|
||||
left_count = best_gap_idx + 1
|
||||
right_count = len(all_wbs_with_cell) - left_count
|
||||
total = len(all_wbs_with_cell)
|
||||
# The border strip is the SMALLER side with < 15% of total
|
||||
if left_count < right_count and left_count / total < 0.15:
|
||||
strip_side = "left"
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
|
||||
elif right_count < left_count and right_count / total < 0.15:
|
||||
strip_side = "right"
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
|
||||
else:
|
||||
continue
|
||||
# Remove strip word_boxes from cells
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
||||
if len(filtered) < len(wbs):
|
||||
border_strip_removed += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
# Remove cells that became empty
|
||||
z["cells"] = [c for c in cells
|
||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info(
|
||||
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
||||
"(gap=%dpx, strip=%d/%d wbs)",
|
||||
border_strip_removed, strip_side, z.get("zone_index", 0),
|
||||
best_gap, left_count if strip_side == "left" else right_count, total,
|
||||
)
|
||||
if border_prefiltered:
|
||||
logger.info("Step 4e: skipped (border pre-filter already applied)")
|
||||
else:
|
||||
# Some textbooks have decorative alphabet strips along the page
|
||||
# edge. OCR picks up scattered letters from these as artifacts.
|
||||
# Detection: find the first significant x-gap (>30 px) from each
|
||||
# page edge between a small cluster (<20 %) and the main content.
|
||||
for z in zones_data:
|
||||
cells = z.get("cells", [])
|
||||
if not cells:
|
||||
continue
|
||||
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
|
||||
for cell in cells:
|
||||
for wb in cell.get("word_boxes") or []:
|
||||
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
|
||||
if len(all_wbs_with_cell) < 10:
|
||||
continue
|
||||
all_wbs_with_cell.sort(key=lambda t: t[0])
|
||||
total = len(all_wbs_with_cell)
|
||||
|
||||
# -- Left-edge scan --
|
||||
left_strip_count = 0
|
||||
left_gap = 0
|
||||
running_right = 0
|
||||
for gi in range(total - 1):
|
||||
running_right = max(
|
||||
running_right,
|
||||
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
|
||||
)
|
||||
gap = all_wbs_with_cell[gi + 1][0] - running_right
|
||||
if gap > 30:
|
||||
left_strip_count = gi + 1
|
||||
left_gap = gap
|
||||
break
|
||||
|
||||
# -- Right-edge scan --
|
||||
right_strip_count = 0
|
||||
right_gap = 0
|
||||
running_left = all_wbs_with_cell[-1][0]
|
||||
for gi in range(total - 1, 0, -1):
|
||||
running_left = min(running_left, all_wbs_with_cell[gi][0])
|
||||
prev_right = (
|
||||
all_wbs_with_cell[gi - 1][0]
|
||||
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
|
||||
)
|
||||
gap = running_left - prev_right
|
||||
if gap > 30:
|
||||
right_strip_count = total - gi
|
||||
right_gap = gap
|
||||
break
|
||||
|
||||
strip_wbs: set = set()
|
||||
strip_side = ""
|
||||
strip_gap = 0
|
||||
strip_count = 0
|
||||
if left_strip_count > 0 and left_strip_count / total < 0.20:
|
||||
strip_side = "left"
|
||||
strip_count = left_strip_count
|
||||
strip_gap = left_gap
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
|
||||
elif right_strip_count > 0 and right_strip_count / total < 0.20:
|
||||
strip_side = "right"
|
||||
strip_count = right_strip_count
|
||||
strip_gap = right_gap
|
||||
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
|
||||
|
||||
if not strip_wbs:
|
||||
continue
|
||||
for cell in cells:
|
||||
wbs = cell.get("word_boxes") or []
|
||||
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
|
||||
if len(filtered) < len(wbs):
|
||||
border_strip_removed += len(wbs) - len(filtered)
|
||||
cell["word_boxes"] = filtered
|
||||
cell["text"] = _words_to_reading_order_text(filtered)
|
||||
z["cells"] = [c for c in cells
|
||||
if (c.get("word_boxes") or c.get("text", "").strip())]
|
||||
logger.info(
|
||||
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
|
||||
"(gap=%dpx, strip=%d/%d wbs)",
|
||||
border_strip_removed, strip_side, z.get("zone_index", 0),
|
||||
strip_gap, strip_count, total,
|
||||
)
|
||||
|
||||
# 5. Color annotation on final word_boxes in cells
|
||||
if img_bgr is not None:
|
||||
|
||||
Reference in New Issue
Block a user