Remove scattered debris rows and disable spanning header detection
- Add Rule 3 to the junk-row filter: rows in which no word is longer than 2 characters are removed as scattered OCR debris from illustrations.
- Fully disable spanning-header detection, which falsely flagged IPA transcriptions and vocabulary entries as spanning headers.
- The first-row heuristic remains for genuine header detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -452,30 +452,11 @@ def _detect_header_rows(
|
|||||||
if 0 not in headers:
|
if 0 not in headers:
|
||||||
headers.append(0)
|
headers.append(0)
|
||||||
|
|
||||||
# Spanning header detection: rows with very few words that span
|
# Note: Spanning-header detection (rows spanning all columns) has been
|
||||||
# across many columns (e.g. "Unit 4: Bonnie Scotland" centred
|
# disabled because it produces too many false positives on vocabulary
|
||||||
# across all columns). Only trigger for clear cases (≥3 cols,
|
# worksheets where IPA transcriptions or short entries naturally span
|
||||||
# ≤3 words) to avoid false positives on vocabulary worksheets
|
# multiple columns with few words. The first-row heuristic above is
|
||||||
# where colored entries naturally span 2 columns.
|
# sufficient for detecting real headers.
|
||||||
if columns and len(columns) >= 3:
|
|
||||||
for row in rows:
|
|
||||||
ri = row["index"]
|
|
||||||
if ri in headers:
|
|
||||||
continue
|
|
||||||
row_words = [
|
|
||||||
w for w in zone_words
|
|
||||||
if row["y_min"] <= w["top"] + w["height"] / 2 <= row["y_max"]
|
|
||||||
]
|
|
||||||
if not row_words or len(row_words) > 3:
|
|
||||||
continue
|
|
||||||
word_x_min = min(w["left"] for w in row_words)
|
|
||||||
word_x_max = max(w["left"] + w["width"] for w in row_words)
|
|
||||||
cols_spanned = sum(
|
|
||||||
1 for c in columns
|
|
||||||
if word_x_min < c["x_max"] and word_x_max > c["x_min"]
|
|
||||||
)
|
|
||||||
if cols_spanned >= 3 and len(row_words) <= 3:
|
|
||||||
headers.append(ri)
|
|
||||||
|
|
||||||
return headers
|
return headers
|
||||||
|
|
||||||
@@ -1124,6 +1105,14 @@ async def build_grid(session_id: str):
|
|||||||
junk_row_indices.add(ri)
|
junk_row_indices.add(ri)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Rule 3: scattered debris — rows with only tiny fragments
|
||||||
|
# (e.g. OCR artifacts from illustrations/graphics).
|
||||||
|
# If the row has no word longer than 2 chars, it's noise.
|
||||||
|
longest = max(len((wb.get("text") or "").strip()) for wb in row_wbs)
|
||||||
|
if longest <= 2:
|
||||||
|
junk_row_indices.add(ri)
|
||||||
|
continue
|
||||||
|
|
||||||
if junk_row_indices:
|
if junk_row_indices:
|
||||||
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
z["cells"] = [c for c in cells if c.get("row_index") not in junk_row_indices]
|
||||||
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
z["rows"] = [r for r in rows if r["index"] not in junk_row_indices]
|
||||||
|
|||||||
Reference in New Issue
Block a user