Fix false header detection and add decorative margin/footer filters
- Remove all_colored spanning header heuristic that falsely flagged colored vocabulary entries (Scotland, secondary school) as headers
- Add _filter_decorative_margin: removes vertical A-Z alphabet strips along page margins (single-char words in a compact vertical strip)
- Add _filter_footer_words: removes page numbers in bottom 5% of page
- Tighten spanning header rule: require ≥3 columns spanned + ≤3 words

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -452,11 +452,12 @@ def _detect_header_rows(
|
|||||||
if 0 not in headers:
|
if 0 not in headers:
|
||||||
headers.append(0)
|
headers.append(0)
|
||||||
|
|
||||||
# Spanning header detection: rows with few words that cross column
|
# Spanning header detection: rows with very few words that span
|
||||||
# boundaries and don't fit the normal multi-column pattern.
|
# across many columns (e.g. "Unit 4: Bonnie Scotland" centred
|
||||||
if columns and len(columns) >= 2:
|
# across all columns). Only trigger for clear cases (≥3 cols,
|
||||||
# Typical data row has words in 2+ columns; a spanning header has
|
# ≤3 words) to avoid false positives on vocabulary worksheets
|
||||||
# words that sit in the middle columns without matching the pattern.
|
# where colored entries naturally span 2 columns.
|
||||||
|
if columns and len(columns) >= 3:
|
||||||
for row in rows:
|
for row in rows:
|
||||||
ri = row["index"]
|
ri = row["index"]
|
||||||
if ri in headers:
|
if ri in headers:
|
||||||
@@ -465,26 +466,15 @@ def _detect_header_rows(
|
|||||||
w for w in zone_words
|
w for w in zone_words
|
||||||
if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"]
|
if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"]
|
||||||
]
|
]
|
||||||
if not row_words or len(row_words) > 6:
|
if not row_words or len(row_words) > 3:
|
||||||
continue # too many words to be a header
|
continue
|
||||||
# Check if all row words are colored (common for section headers)
|
|
||||||
all_colored = all(
|
|
||||||
w.get("color_name") and w.get("color_name") != "black"
|
|
||||||
for w in row_words
|
|
||||||
)
|
|
||||||
# Check if words span across the middle columns (not in col 0)
|
|
||||||
word_x_min = min(w["left"] for w in row_words)
|
word_x_min = min(w["left"] for w in row_words)
|
||||||
word_x_max = max(w["left"] + w["width"] for w in row_words)
|
word_x_max = max(w["left"] + w["width"] for w in row_words)
|
||||||
first_col_end = columns[0]["x_max"] if columns else 0
|
|
||||||
# Header if: colored text that starts after the first column
|
|
||||||
# or spans more than 2 columns
|
|
||||||
cols_spanned = sum(
|
cols_spanned = sum(
|
||||||
1 for c in columns
|
1 for c in columns
|
||||||
if word_x_min < c["x_max"] and word_x_max > c["x_min"]
|
if word_x_min < c["x_max"] and word_x_max > c["x_min"]
|
||||||
)
|
)
|
||||||
if all_colored and cols_spanned >= 2:
|
if cols_spanned >= 3 and len(row_words) <= 3:
|
||||||
headers.append(ri)
|
|
||||||
elif cols_spanned >= 3 and len(row_words) <= 4:
|
|
||||||
headers.append(ri)
|
headers.append(ri)
|
||||||
|
|
||||||
return headers
|
return headers
|
||||||
@@ -655,6 +645,100 @@ def _get_content_bounds(words: List[Dict]) -> tuple:
|
|||||||
return x_min, y_min, x_max - x_min, y_max - y_min
|
return x_min, y_min, x_max - x_min, y_max - y_min
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_decorative_margin(
|
||||||
|
words: List[Dict],
|
||||||
|
img_w: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> None:
|
||||||
|
"""Remove words that belong to a decorative alphabet strip on a margin.
|
||||||
|
|
||||||
|
Some vocabulary worksheets have a vertical A–Z alphabet graphic along
|
||||||
|
the left or right edge. OCR reads each letter as an isolated single-
|
||||||
|
character word. These decorative elements are not content and confuse
|
||||||
|
column/row detection.
|
||||||
|
|
||||||
|
Detection criteria:
|
||||||
|
- Words are in the outer 30% of the page (left or right)
|
||||||
|
- Nearly all words are single characters (letters or digits)
|
||||||
|
- At least 8 such words form a vertical strip (≥8 unique Y positions)
|
||||||
|
- Average horizontal spread of the strip is small (< 60px)
|
||||||
|
|
||||||
|
Modifies *words* in place.
|
||||||
|
"""
|
||||||
|
if not words or img_w <= 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
margin_cutoff = img_w * 0.30
|
||||||
|
# Candidate margin words: single char, in left or right 30%
|
||||||
|
left_strip = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) == 1
|
||||||
|
and w["left"] + w.get("width", 0) / 2 < margin_cutoff
|
||||||
|
]
|
||||||
|
right_strip = [
|
||||||
|
w for w in words
|
||||||
|
if len((w.get("text") or "").strip()) == 1
|
||||||
|
and w["left"] + w.get("width", 0) / 2 > img_w - margin_cutoff
|
||||||
|
]
|
||||||
|
|
||||||
|
for strip, side in [(left_strip, "left"), (right_strip, "right")]:
|
||||||
|
if len(strip) < 8:
|
||||||
|
continue
|
||||||
|
# Check vertical distribution: should have many distinct Y positions
|
||||||
|
y_centers = sorted(set(
|
||||||
|
int(w["top"] + w.get("height", 0) / 2) // 20 * 20 # bucket
|
||||||
|
for w in strip
|
||||||
|
))
|
||||||
|
if len(y_centers) < 6:
|
||||||
|
continue
|
||||||
|
# Check horizontal compactness
|
||||||
|
x_positions = [w["left"] for w in strip]
|
||||||
|
x_spread = max(x_positions) - min(x_positions)
|
||||||
|
if x_spread > 80:
|
||||||
|
continue
|
||||||
|
# This looks like a decorative alphabet strip — remove these words
|
||||||
|
strip_set = set(id(w) for w in strip)
|
||||||
|
before = len(words)
|
||||||
|
words[:] = [w for w in words if id(w) not in strip_set]
|
||||||
|
removed = before - len(words)
|
||||||
|
if removed:
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: removed %d decorative %s-margin chars",
|
||||||
|
session_id, removed, side,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_footer_words(
|
||||||
|
words: List[Dict],
|
||||||
|
img_h: int,
|
||||||
|
log: Any,
|
||||||
|
session_id: str,
|
||||||
|
) -> None:
|
||||||
|
"""Remove isolated words in the bottom 5% of the page (page numbers).
|
||||||
|
|
||||||
|
Modifies *words* in place.
|
||||||
|
"""
|
||||||
|
if not words or img_h <= 0:
|
||||||
|
return
|
||||||
|
footer_y = img_h * 0.95
|
||||||
|
footer_words = [
|
||||||
|
w for w in words
|
||||||
|
if w["top"] + w.get("height", 0) / 2 > footer_y
|
||||||
|
]
|
||||||
|
if not footer_words:
|
||||||
|
return
|
||||||
|
# Only remove if footer has very few words (≤ 3) with short text
|
||||||
|
total_text = "".join((w.get("text") or "").strip() for w in footer_words)
|
||||||
|
if len(footer_words) <= 3 and len(total_text) <= 10:
|
||||||
|
footer_set = set(id(w) for w in footer_words)
|
||||||
|
words[:] = [w for w in words if id(w) not in footer_set]
|
||||||
|
log.info(
|
||||||
|
"build-grid session %s: removed %d footer words ('%s')",
|
||||||
|
session_id, len(footer_words), total_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Endpoints
|
# Endpoints
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -696,7 +780,18 @@ async def build_grid(session_id: str):
|
|||||||
logger.info("build-grid session %s: %d words from %d cells",
|
logger.info("build-grid session %s: %d words from %d cells",
|
||||||
session_id, len(all_words), len(word_result["cells"]))
|
session_id, len(all_words), len(word_result["cells"]))
|
||||||
|
|
||||||
# 2b. Filter words inside detected graphic/image regions
|
# 2b. Filter decorative margin columns (alphabet graphics).
|
||||||
|
# Some worksheets have a decorative alphabet strip along one margin
|
||||||
|
# (A-Z in a graphic). OCR reads these as single-char words aligned
|
||||||
|
# vertically. Detect and remove them before grid building.
|
||||||
|
_filter_decorative_margin(all_words, img_w, logger, session_id)
|
||||||
|
|
||||||
|
# 2c. Filter footer rows (page numbers at the very bottom).
|
||||||
|
# Isolated short text in the bottom 5% of the page is typically a
|
||||||
|
# page number ("64", "S. 12") and not real content.
|
||||||
|
_filter_footer_words(all_words, img_h, logger, session_id)
|
||||||
|
|
||||||
|
# 2d. Filter words inside detected graphic/image regions
|
||||||
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
|
# Only remove LOW-CONFIDENCE words (likely OCR artifacts from images).
|
||||||
# High-confidence words are real text even if they overlap a detected
|
# High-confidence words are real text even if they overlap a detected
|
||||||
# graphic region (e.g. colored text that graphic detection couldn't
|
# graphic region (e.g. colored text that graphic detection couldn't
|
||||||
|
|||||||
Reference in New Issue
Block a user