Auto-filter decorative margin strips and header junk
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m45s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 26s
CI / test-go-edu-search (push) Successful in 26s
CI / test-python-klausur (push) Failing after 1m45s
CI / test-python-agent-core (push) Successful in 15s
CI / test-nodejs-website (push) Successful in 15s
- _filter_decorative_margin: Phase 2 now also removes short words (<=3 chars) in the same narrow x-range as the detected single-char strip, catching multi-char OCR artifacts like "Vv" from alphabet graphics.
- _filter_header_junk: New filter detects the content start (the first row with 3+ high-confidence words) and removes low-confidence short fragments above it, which are OCR artifacts from header illustrations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -663,11 +663,15 @@ def _filter_decorative_margin(
|
|||||||
character word. These decorative elements are not content and confuse
|
character word. These decorative elements are not content and confuse
|
||||||
column/row detection.
|
column/row detection.
|
||||||
|
|
||||||
Detection criteria:
|
Detection criteria (phase 1 — find the strip using single-char words):
|
||||||
- Words are in the outer 30% of the page (left or right)
|
- Words are in the outer 30% of the page (left or right)
|
||||||
- Nearly all words are single characters (letters or digits)
|
- Nearly all words are single characters (letters or digits)
|
||||||
- At least 8 such words form a vertical strip (≥8 unique Y positions)
|
- At least 8 such words form a vertical strip (≥8 unique Y positions)
|
||||||
- Average horizontal spread of the strip is small (< 60px)
|
- Average horizontal spread of the strip is small (< 80px)
|
||||||
|
|
||||||
|
Phase 2 — once a strip is confirmed, also remove any short word (≤3
|
||||||
|
chars) in the same narrow x-range. This catches multi-char OCR
|
||||||
|
artifacts like "Vv" that belong to the same decorative element.
|
||||||
|
|
||||||
Modifies *words* in place.
|
Modifies *words* in place.
|
||||||
"""
|
"""
|
||||||
@@ -675,7 +679,7 @@ def _filter_decorative_margin(
|
|||||||
return
|
return
|
||||||
|
|
||||||
margin_cutoff = img_w * 0.30
|
margin_cutoff = img_w * 0.30
|
||||||
# Candidate margin words: single char, in left or right 30%
|
# Phase 1: find candidate strips using single-char words
|
||||||
left_strip = [
|
left_strip = [
|
||||||
w for w in words
|
w for w in words
|
||||||
if len((w.get("text") or "").strip()) == 1
|
if len((w.get("text") or "").strip()) == 1
|
||||||
@@ -699,18 +703,34 @@ def _filter_decorative_margin(
|
|||||||
continue
|
continue
|
||||||
# Check horizontal compactness
|
# Check horizontal compactness
|
||||||
x_positions = [w["left"] for w in strip]
|
x_positions = [w["left"] for w in strip]
|
||||||
x_spread = max(x_positions) - min(x_positions)
|
x_min = min(x_positions)
|
||||||
|
x_max = max(x_positions)
|
||||||
|
x_spread = x_max - x_min
|
||||||
if x_spread > 80:
|
if x_spread > 80:
|
||||||
continue
|
continue
|
||||||
# This looks like a decorative alphabet strip — remove these words
|
|
||||||
strip_set = set(id(w) for w in strip)
|
# Phase 2: strip confirmed — also collect short words in same x-range
|
||||||
|
# Expand x-range slightly to catch neighbors (e.g. "Vv" next to "U")
|
||||||
|
strip_x_lo = x_min - 20
|
||||||
|
strip_x_hi = x_max + 60 # word width + tolerance
|
||||||
|
all_strip_words = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) <= 3
|
||||||
|
and strip_x_lo <= w["left"] <= strip_x_hi
|
||||||
|
and (w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
||||||
|
if side == "left"
|
||||||
|
else w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff)
|
||||||
|
]
|
||||||
|
|
||||||
|
strip_set = set(id(w) for w in all_strip_words)
|
||||||
before = len(words)
|
before = len(words)
|
||||||
words[:] = [w for w in words if id(w) not in strip_set]
|
words[:] = [w for w in words if id(w) not in strip_set]
|
||||||
removed = before - len(words)
|
removed = before - len(words)
|
||||||
if removed:
|
if removed:
|
||||||
log.info(
|
log.info(
|
||||||
"build-grid session %s: removed %d decorative %s-margin chars",
|
"build-grid session %s: removed %d decorative %s-margin words "
|
||||||
session_id, removed, side,
|
"(strip x=%d-%d)",
|
||||||
|
session_id, removed, side, strip_x_lo, strip_x_hi,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -744,6 +764,82 @@ def _filter_footer_words(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_header_junk(
|
||||||
|
words: List[Dict],
|
||||||
|
img_h: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> None:
|
||||||
|
"""Remove OCR junk from header illustrations above the real content.
|
||||||
|
|
||||||
|
Textbook pages often have decorative header graphics (illustrations,
|
||||||
|
icons) that OCR reads as low-confidence junk characters. Real content
|
||||||
|
typically starts further down the page.
|
||||||
|
|
||||||
|
Algorithm:
|
||||||
|
1. Find the "content start" — the first Y position where a dense
|
||||||
|
horizontal row of 3+ high-confidence words begins.
|
||||||
|
2. Above that line, remove words with conf < 75 and text ≤ 3 chars.
|
||||||
|
These are almost certainly OCR artifacts from illustrations.
|
||||||
|
|
||||||
|
Modifies *words* in place.
|
||||||
|
"""
|
||||||
|
if not words or img_h <= 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
# --- Find content start: first horizontal row with ≥3 high-conf words ---
|
||||||
|
# Sort words by Y
|
||||||
|
sorted_by_y = sorted(words, key=lambda w: w["top"])
|
||||||
|
content_start_y = 0
|
||||||
|
_ROW_TOLERANCE = img_h * 0.02 # words within 2% of page height = same row
|
||||||
|
_MIN_ROW_WORDS = 3
|
||||||
|
_MIN_CONF = 80
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
while i < len(sorted_by_y):
|
||||||
|
row_y = sorted_by_y[i]["top"]
|
||||||
|
# Collect words in this row band
|
||||||
|
row_words = []
|
||||||
|
j = i
|
||||||
|
while j < len(sorted_by_y) and sorted_by_y[j]["top"] - row_y < _ROW_TOLERANCE:
|
||||||
|
row_words.append(sorted_by_y[j])
|
||||||
|
j += 1
|
||||||
|
# Count high-confidence words with real text (> 1 char)
|
||||||
|
high_conf = [
|
||||||
|
w for w in row_words
|
||||||
|
if w.get("conf", 0) >= _MIN_CONF
|
||||||
|
and len((w.get("text") or "").strip()) > 1
|
||||||
|
]
|
||||||
|
if len(high_conf) >= _MIN_ROW_WORDS:
|
||||||
|
content_start_y = row_y
|
||||||
|
break
|
||||||
|
i = j if j > i else i + 1
|
||||||
|
|
||||||
|
if content_start_y <= 0:
|
||||||
|
return # no clear content start found
|
||||||
|
|
||||||
|
# --- Remove low-conf short junk above content start ---
|
||||||
|
junk = [
|
||||||
|
w for w in words
|
||||||
|
if w["top"] + w.get("height", 0) < content_start_y
|
||||||
|
and w.get("conf", 0) < 75
|
||||||
|
and len((w.get("text") or "").strip()) <= 3
|
||||||
|
]
|
||||||
|
if not junk:
|
||||||
|
return
|
||||||
|
|
||||||
|
junk_set = set(id(w) for w in junk)
|
||||||
|
before = len(words)
|
||||||
|
words[:] = [w for w in words if id(w) not in junk_set]
|
||||||
|
removed = before - len(words)
|
||||||
|
if removed:
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: removed %d header junk words above y=%d "
|
||||||
|
"(content start)",
|
||||||
|
session_id, removed, content_start_y,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Core computation (used by build-grid endpoint and regression tests)
|
# Core computation (used by build-grid endpoint and regression tests)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -792,6 +888,10 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
|||||||
# page number ("64", "S. 12") and not real content.
|
# page number ("64", "S. 12") and not real content.
|
||||||
_filter_footer_words(all_words, img_h, logger, session_id)
|
_filter_footer_words(all_words, img_h, logger, session_id)
|
||||||
|
|
||||||
|
# 2c2. Filter OCR junk from header illustrations.
|
||||||
|
# Low-confidence short fragments above the first real content row.
|
||||||
|
_filter_header_junk(all_words, img_h, logger, session_id)
|
||||||
|
|
||||||
# 2d. Filter words inside user-defined exclude regions (from Structure step).
|
# 2d. Filter words inside user-defined exclude regions (from Structure step).
|
||||||
# These are explicitly marked by the user, so ALL words inside are removed
|
# These are explicitly marked by the user, so ALL words inside are removed
|
||||||
# regardless of confidence.
|
# regardless of confidence.
|
||||||
|
|||||||
Reference in New Issue
Block a user