Compare commits

...

2 Commits

Author SHA1 Message Date
Benjamin Admin
1fae39dbb8 fix: lower secondary column threshold + strip pipe chars from word_boxes
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 35s
CI / test-go-edu-search (push) Successful in 29s
CI / test-python-klausur (push) Failing after 1m57s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 18s
Dictionary pages have 2 dictionary columns, each with article + headword
sub-columns. The right article column (die/der at x≈626) had only 14.3%
row coverage — below the 20% secondary threshold. Lowered to 12% so
dictionary article columns qualify. Also strip pipe characters from
individual word_box text (not just cell text) to remove OCR syllable
separation marks (e.g. "zu|trau|en" → "zutrauen").

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 07:44:03 +01:00
Benjamin Admin
46c8c28d34 fix: border strip pre-filter + 3-column detection for vocabulary tables
The border strip filter (Step 4e) used the LARGEST x-gap, which incorrectly
removed base words along with edge artifacts. Now uses a two-stage approach:
1. _filter_border_strip_words() pre-filters raw words BEFORE column detection,
   scanning from the page edge inward to find the FIRST significant gap (>30px)
2. Step 4e runs as fallback only when pre-filter didn't apply

Session 4233 now correctly detects 3 columns (base word | oder | synonyms)
instead of 2. Threshold raised from 15% to 20% to handle pages with many
edge artifacts. All 4 ground-truth sessions pass regression.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 21:01:43 +01:00
2 changed files with 219 additions and 120 deletions

View File

@@ -14,7 +14,7 @@ DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
import logging
import re
import time
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import cv2
import numpy as np
@@ -40,6 +40,60 @@ router = APIRouter(prefix="/api/v1/ocr-pipeline", tags=["grid-editor"])
# Helpers
# ---------------------------------------------------------------------------
def _filter_border_strip_words(words: List[Dict]) -> Tuple[List[Dict], int]:
"""Remove page-border decoration strip words BEFORE column detection.
Scans from each page edge inward to find the first significant x-gap
(>30 px). If the edge cluster contains <15 % of total words, those
words are removed as border-strip artifacts (alphabet letters,
illustration fragments).
Must run BEFORE ``_build_zone_grid`` so that column detection only
sees real content words and doesn't produce inflated row counts.
"""
if len(words) < 10:
return words, 0
sorted_words = sorted(words, key=lambda w: w.get("left", 0))
total = len(sorted_words)
# -- Left-edge scan (running max right-edge) --
left_count = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
sorted_words[gi].get("left", 0) + sorted_words[gi].get("width", 0),
)
if sorted_words[gi + 1].get("left", 0) - running_right > 30:
left_count = gi + 1
break
# -- Right-edge scan (running min left) --
right_count = 0
running_left = sorted_words[-1].get("left", 0)
for gi in range(total - 1, 0, -1):
running_left = min(running_left, sorted_words[gi].get("left", 0))
prev_right = (
sorted_words[gi - 1].get("left", 0)
+ sorted_words[gi - 1].get("width", 0)
)
if running_left - prev_right > 30:
right_count = total - gi
break
strip_ids: set = set()
if left_count > 0 and left_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[:left_count]}
elif right_count > 0 and right_count / total < 0.20:
strip_ids = {id(w) for w in sorted_words[total - right_count :]}
if not strip_ids:
return words, 0
return [w for w in words if id(w) not in strip_ids], len(strip_ids)
def _cluster_columns_by_alignment(
words: List[Dict],
zone_w: int,
@@ -153,7 +207,7 @@ def _cluster_columns_by_alignment(
# text (random inter-word gaps) while still detecting real columns in
# vocabulary worksheets (which typically have >80% row coverage).
MIN_COVERAGE_PRIMARY = 0.35
MIN_COVERAGE_SECONDARY = 0.20
MIN_COVERAGE_SECONDARY = 0.12
MIN_WORDS_SECONDARY = 4
MIN_DISTINCT_ROWS = 3
@@ -1447,6 +1501,7 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
zones_data: List[Dict[str, Any]] = []
boxes_detected = 0
recovered_count = 0
border_prefiltered = False
img_bgr = None
content_x, content_y, content_w, content_h = _get_content_bounds(all_words)
@@ -1591,6 +1646,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid: filtered %d words inside image overlays from zone %d",
ov_removed, pz.index,
)
zone_words, bs_removed = _filter_border_strip_words(zone_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid: pre-filtered %d border-strip words from zone %d",
bs_removed, pz.index,
)
grid = _build_zone_grid(
zone_words, pz.x, pz.y, pz.width, pz.height,
pz.index, img_w, img_h,
@@ -1728,6 +1790,16 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
"build-grid session %s: filtered %d recovered artifacts (fallback zone)",
session_id, removed,
)
# Pre-filter border-strip words so column detection is not
# confused by edge artifacts. When this removes words, Step 4e
# is skipped (it would otherwise re-detect content as a "strip").
filtered_words, bs_removed = _filter_border_strip_words(filtered_words)
if bs_removed:
border_prefiltered = True
logger.info(
"build-grid session %s: pre-filtered %d border-strip words",
session_id, bs_removed,
)
grid = _build_zone_grid(
filtered_words, content_x, content_y, content_w, content_h,
0, img_w, img_h,
@@ -1884,10 +1956,14 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
removed_pipes, z.get("zone_index", 0),
)
# Also strip leading/trailing pipe chars from cell text that may remain
# from word_boxes that contained mixed text like "word|" or "|word".
# Also strip pipe chars from word_box text and cell text that may remain
# from OCR reading syllable-separation marks (e.g. "zu|trau|en" → "zutrauen").
for z in zones_data:
for cell in z.get("cells", []):
for wb in cell.get("word_boxes", []):
wbt = wb.get("text", "")
if "|" in wbt:
wb["text"] = wbt.replace("|", "")
text = cell.get("text", "")
if "|" in text:
cleaned = text.replace("|", "").strip()
@@ -1895,48 +1971,78 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
cell["text"] = cleaned
# 4e. Detect and remove page-border decoration strips.
# Some textbooks have decorative alphabet strips along the page edge
# (coloured letters, illustrations). OCR picks up scattered letters
# from these as artifacts. Detection: find a significant x-gap
# (>30 px) between a small cluster of word_boxes near the page edge
# and the main content block.
# Skipped when the pre-filter already removed border words BEFORE
# column detection — re-running would incorrectly detect the
# leftmost content column as a "strip".
border_strip_removed = 0
if border_prefiltered:
logger.info("Step 4e: skipped (border pre-filter already applied)")
else:
# Some textbooks have decorative alphabet strips along the page
# edge. OCR picks up scattered letters from these as artifacts.
# Detection: find the first significant x-gap (>30 px) from each
# page edge between a small cluster (<20 %) and the main content.
for z in zones_data:
cells = z.get("cells", [])
if not cells:
continue
# Collect all word_boxes with their cell reference
all_wbs_with_cell: List[tuple] = [] # (left, wb, cell)
for cell in cells:
for wb in cell.get("word_boxes") or []:
all_wbs_with_cell.append((wb.get("left", 0), wb, cell))
if len(all_wbs_with_cell) < 10:
continue
# Sort by x and find the largest gap
all_wbs_with_cell.sort(key=lambda t: t[0])
best_gap = 0
best_gap_idx = -1
for gi in range(len(all_wbs_with_cell) - 1):
right_edge = all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0)
gap = all_wbs_with_cell[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_gap_idx = gi
if best_gap < 30 or best_gap_idx < 0:
continue
left_count = best_gap_idx + 1
right_count = len(all_wbs_with_cell) - left_count
total = len(all_wbs_with_cell)
# The border strip is the SMALLER side with < 15% of total
if left_count < right_count and left_count / total < 0.15:
# -- Left-edge scan --
left_strip_count = 0
left_gap = 0
running_right = 0
for gi in range(total - 1):
running_right = max(
running_right,
all_wbs_with_cell[gi][0] + all_wbs_with_cell[gi][1].get("width", 0),
)
gap = all_wbs_with_cell[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
left_gap = gap
break
# -- Right-edge scan --
right_strip_count = 0
right_gap = 0
running_left = all_wbs_with_cell[-1][0]
for gi in range(total - 1, 0, -1):
running_left = min(running_left, all_wbs_with_cell[gi][0])
prev_right = (
all_wbs_with_cell[gi - 1][0]
+ all_wbs_with_cell[gi - 1][1].get("width", 0)
)
gap = running_left - prev_right
if gap > 30:
right_strip_count = total - gi
right_gap = gap
break
strip_wbs: set = set()
strip_side = ""
strip_gap = 0
strip_count = 0
if left_strip_count > 0 and left_strip_count / total < 0.20:
strip_side = "left"
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_count]}
elif right_count < left_count and right_count / total < 0.15:
strip_count = left_strip_count
strip_gap = left_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[:left_strip_count]}
elif right_strip_count > 0 and right_strip_count / total < 0.20:
strip_side = "right"
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[left_count:]}
else:
strip_count = right_strip_count
strip_gap = right_gap
strip_wbs = {id(t[1]) for t in all_wbs_with_cell[total - right_strip_count:]}
if not strip_wbs:
continue
# Remove strip word_boxes from cells
for cell in cells:
wbs = cell.get("word_boxes") or []
filtered = [wb for wb in wbs if id(wb) not in strip_wbs]
@@ -1944,14 +2050,13 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
border_strip_removed += len(wbs) - len(filtered)
cell["word_boxes"] = filtered
cell["text"] = _words_to_reading_order_text(filtered)
# Remove cells that became empty
z["cells"] = [c for c in cells
if (c.get("word_boxes") or c.get("text", "").strip())]
logger.info(
"Step 4e: removed %d border-strip word_boxes (%s) from zone %d "
"(gap=%dpx, strip=%d/%d wbs)",
border_strip_removed, strip_side, z.get("zone_index", 0),
best_gap, left_count if strip_side == "left" else right_count, total,
strip_gap, strip_count, total,
)
# 5. Color annotation on final word_boxes in cells

View File

@@ -1109,64 +1109,56 @@ class TestBorderStripFilter:
def test_left_border_strip_removed(self):
"""Word_boxes at x<120 with 45px gap to content at x>=179 are removed."""
# Simulate border strip (11 wbs) + real content (20 wbs)
# Simulate border strip (3 wbs) + base words (7 wbs) + oder (7 wbs)
# + synonyms (20 wbs). The old "largest gap" algorithm would pick
# the 67px gap between base words and "oder", removing base words.
# The new "first gap from edge" algorithm picks the 45px gap between
# border artifacts and base words.
border_wbs = [
self._make_wb("M", 49, 436, 46, 44),
self._make_wb("x", 113, 610, 21, 38),
self._make_wb("Er", 45, 998, 62, 37),
self._make_wb("M", 49, 436, 46, 44), # right=95
self._make_wb("x", 113, 610, 21, 38), # right=134
self._make_wb("Er", 45, 998, 62, 37), # right=107
]
content_wbs = []
for i in range(20):
# Place content words at x=179 and x=280 (gap=1px between them,
# much smaller than the 45px border-to-content gap)
content_wbs.append(self._make_wb(f"word{i}", 179 + (i % 2) * 51, 100 + i * 40))
# Build zone with cells
cells = []
# Border-only cells
for i, wb in enumerate(border_wbs):
cells.append({"cell_id": f"R{i}_C0", "col_index": 0, "row_index": i,
"word_boxes": [wb], "text": wb["text"]})
# Content cells
for i, wb in enumerate(content_wbs):
ri = len(border_wbs) + i
cells.append({"cell_id": f"R{ri}_C1", "col_index": 1, "row_index": ri,
"word_boxes": [wb], "text": wb["text"]})
zone = {"zone_index": 0, "zone_type": "content", "cells": cells,
"columns": [], "rows": []}
# The filter runs inside _build_grid_core, but we can test the
# pattern detection logic: 3 border wbs + 20 content wbs,
# border right edge = 113+21=134, content left = 179, gap = 45px
# 3/23 = 13% < 15% threshold
from cv_ocr_engines import _group_words_into_lines
all_left = sorted(
[(wb["left"], wb) for cell in cells for wb in cell.get("word_boxes", [])],
key=lambda t: t[0]
)
# Find largest gap
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
assert best_gap >= 30, f"Gap should be >=30, got {best_gap}"
left_count = best_idx + 1
base_wbs = [self._make_wb(f"base{i}", 179, 100 + i * 60, 100, 20) for i in range(7)]
oder_wbs = [self._make_wb("oder", 379, 100 + i * 60, 68, 20) for i in range(7)]
synonym_wbs = [self._make_wb(f"syn{i}", 452 + (i % 5) * 30, 100 + (i // 5) * 60, 80, 20) for i in range(20)]
all_wbs = border_wbs + base_wbs + oder_wbs + synonym_wbs
all_left = sorted([(wb["left"], wb) for wb in all_wbs], key=lambda t: t[0])
total = len(all_left)
assert left_count / total < 0.15, f"Border ratio {left_count}/{total} should be <15%"
# New algorithm: scan from left edge, find FIRST gap >30px
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# Should find the 45px gap between border (right=134) and base (left=179)
assert left_strip_count == len(border_wbs), (
f"Expected {len(border_wbs)} border wbs, got {left_strip_count}"
)
assert left_strip_count / total < 0.20, (
f"Border ratio {left_strip_count}/{total} should be <20%"
)
def test_no_removal_when_no_gap(self):
"""No gap > 30px between word_boxes → nothing removed."""
# Words spaced 20px apart with width 50 → overlap, no gap >30px
wbs = [self._make_wb(f"w{i}", 10 + i * 20, 100) for i in range(15)]
all_left = sorted([(wb["left"], wb) for wb in wbs], key=lambda t: t[0])
best_gap = 0
running_right = 0
found_gap = False
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
assert best_gap < 30, f"No significant gap expected, got {best_gap}"
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
found_gap = True
break
assert not found_gap, "No significant gap expected"
def test_equal_sides_not_removed(self):
"""Two roughly equal groups (50/50) are NOT treated as border strip."""
@@ -1176,15 +1168,17 @@ class TestBorderStripFilter:
[(wb["left"], wb) for wb in left_wbs + right_wbs],
key=lambda t: t[0]
)
best_gap = 0
best_idx = -1
for gi in range(len(all_left) - 1):
right_edge = all_left[gi][0] + all_left[gi][1]["width"]
gap = all_left[gi + 1][0] - right_edge
if gap > best_gap:
best_gap = gap
best_idx = gi
left_count = best_idx + 1
total = len(all_left)
# Left scan: first gap >30px from left
running_right = 0
left_strip_count = 0
for gi in range(total - 1):
running_right = max(running_right, all_left[gi][0] + all_left[gi][1]["width"])
gap = all_left[gi + 1][0] - running_right
if gap > 30:
left_strip_count = gi + 1
break
# 10/20 = 50% — NOT below 15% threshold, so no removal
assert left_count / total >= 0.15, "Equal groups should NOT trigger border removal"
assert left_strip_count == 0 or left_strip_count / total >= 0.20, (
"Equal groups should NOT trigger border removal"
)