Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s
When the OCR merge step expands a prefix word box (e.g. "zer" w=42 → w=104),
the expanded box heavily overlaps (>75%) with the next fragment ("brech"). As a
result, the grid builder's overlap filter previously removed the prefix as a duplicate.
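Fix: when overlap > 75% and both boxes are alphabetic, their texts differ
(compared case-insensitively, ignoring trailing punctuation), and the shorter
text is ≤ 4 chars, merge the two boxes instead of removing one. Also enable
chain merging via merge_parent tracking (absorbed index → absorber index), so
"zer" + "brech" + "lich" → "zerbrechlich" in a single pass.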
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1353,6 +1353,19 @@ async def _build_grid_core(
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
# High overlap (>75%) with different alphabetic text:
|
||||
# OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
|
||||
# causing it to heavily overlap with the next fragment ("brech").
|
||||
# Merge instead of removing when one is a short prefix (≤4 chars)
|
||||
# and the texts are different.
|
||||
if (overlap_pct > 0.75
|
||||
and _ALPHA_WORD_RE.match(t1)
|
||||
and _ALPHA_WORD_RE.match(t2)
|
||||
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
|
||||
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
if overlap_pct <= 0.40:
|
||||
continue # too little overlap and not alphabetic merge
|
||||
|
||||
@@ -1393,15 +1406,22 @@ async def _build_grid_core(
|
||||
c2 = w2.get("conf", 50)
|
||||
to_remove.add(i1 if c1 <= c2 else i2)
|
||||
|
||||
# Execute merges first (syllable-split words)
|
||||
# Execute merges first (syllable-split words).
|
||||
# Use merge_parent to support chain merging: if "zer" absorbed
|
||||
# "brech" and then "brech"+"lich" is a merge pair, redirect to
|
||||
# merge "lich" into "zer" → "zerbrechlich".
|
||||
if to_merge:
|
||||
merged_indices: set = set()
|
||||
merge_parent: Dict[int, int] = {} # absorbed → absorber
|
||||
for mi1, mi2 in to_merge:
|
||||
if mi1 in to_remove or mi2 in to_remove:
|
||||
continue # don't merge if one is being removed
|
||||
if mi1 in merged_indices or mi2 in merged_indices:
|
||||
continue # already merged
|
||||
mw1, mw2 = wbs[mi1], wbs[mi2]
|
||||
# Follow chain: if mi1 was absorbed, find root absorber
|
||||
actual_mi1 = mi1
|
||||
while actual_mi1 in merge_parent:
|
||||
actual_mi1 = merge_parent[actual_mi1]
|
||||
if actual_mi1 in to_remove or mi2 in to_remove:
|
||||
continue
|
||||
if mi2 in merge_parent:
|
||||
continue # mi2 already absorbed
|
||||
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
|
||||
# Concatenate text (no space — they're parts of one word)
|
||||
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
|
||||
mt2 = (mw2.get("text") or "").strip()
|
||||
@@ -1419,9 +1439,8 @@ async def _build_grid_core(
|
||||
mw1["width"] = mr - mx
|
||||
mw1["height"] = mb - my
|
||||
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
|
||||
to_remove.add(mi2) # remove the second one
|
||||
merged_indices.add(mi1)
|
||||
merged_indices.add(mi2)
|
||||
to_remove.add(mi2)
|
||||
merge_parent[mi2] = actual_mi1
|
||||
bullet_removed -= 1 # net: merge, not removal
|
||||
|
||||
if to_remove:
|
||||
|
||||
Reference in New Issue
Block a user