Fix Bug 3: recover OCR-lost prefixes via overlap merge + chain merging
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 29s
CI / test-go-edu-search (push) Successful in 27s
CI / test-python-klausur (push) Failing after 2m24s
CI / test-python-agent-core (push) Successful in 17s
CI / test-nodejs-website (push) Successful in 19s

When OCR merge expands a prefix word box (e.g. "zer" grows from w=42 to
w=104), the expanded box heavily overlaps (>75%) with the next fragment
("brech"). The grid builder's overlap filter previously treated the prefix
as a duplicate and removed it.

Fix: when overlap is > 75% but both boxes are alphabetic words whose texts
differ (compared case-insensitively, ignoring trailing punctuation) and the
shorter text is ≤ 4 chars, merge the two boxes instead of removing one.
Also enable chain merging via merge_parent tracking, so that
"zer" + "brech" + "lich" → "zerbrechlich" in a single pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-27 15:49:52 +01:00
parent 96ea23164d
commit 0685fb12da

View File

@@ -1353,6 +1353,19 @@ async def _build_grid_core(
to_merge.append((i1, i2)) to_merge.append((i1, i2))
continue continue
# High overlap (>75%) with different alphabetic text:
# OCR merge can expand a prefix box (e.g. "zer" w=42 → w=104)
# causing it to heavily overlap with the next fragment ("brech").
# Merge instead of removing when one is a short prefix (≤4 chars)
# and the texts are different.
if (overlap_pct > 0.75
and _ALPHA_WORD_RE.match(t1)
and _ALPHA_WORD_RE.match(t2)
and t1.rstrip(".,;:!?").lower() != t2.rstrip(".,;:!?").lower()
and min(len(t1.rstrip(".,;:!?")), len(t2.rstrip(".,;:!?"))) <= 4):
to_merge.append((i1, i2))
continue
if overlap_pct <= 0.40: if overlap_pct <= 0.40:
continue # too little overlap and not alphabetic merge continue # too little overlap and not alphabetic merge
@@ -1393,15 +1406,22 @@ async def _build_grid_core(
c2 = w2.get("conf", 50) c2 = w2.get("conf", 50)
to_remove.add(i1 if c1 <= c2 else i2) to_remove.add(i1 if c1 <= c2 else i2)
# Execute merges first (syllable-split words) # Execute merges first (syllable-split words).
# Use merge_parent to support chain merging: if "zer" absorbed
# "brech" and then "brech"+"lich" is a merge pair, redirect to
# merge "lich" into "zer" → "zerbrechlich".
if to_merge: if to_merge:
merged_indices: set = set() merge_parent: Dict[int, int] = {} # absorbed → absorber
for mi1, mi2 in to_merge: for mi1, mi2 in to_merge:
if mi1 in to_remove or mi2 in to_remove: # Follow chain: if mi1 was absorbed, find root absorber
continue # don't merge if one is being removed actual_mi1 = mi1
if mi1 in merged_indices or mi2 in merged_indices: while actual_mi1 in merge_parent:
continue # already merged actual_mi1 = merge_parent[actual_mi1]
mw1, mw2 = wbs[mi1], wbs[mi2] if actual_mi1 in to_remove or mi2 in to_remove:
continue
if mi2 in merge_parent:
continue # mi2 already absorbed
mw1, mw2 = wbs[actual_mi1], wbs[mi2]
# Concatenate text (no space — they're parts of one word) # Concatenate text (no space — they're parts of one word)
mt1 = (mw1.get("text") or "").rstrip(".,;:!?") mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
mt2 = (mw2.get("text") or "").strip() mt2 = (mw2.get("text") or "").strip()
@@ -1419,9 +1439,8 @@ async def _build_grid_core(
mw1["width"] = mr - mx mw1["width"] = mr - mx
mw1["height"] = mb - my mw1["height"] = mb - my
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2 mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
to_remove.add(mi2) # remove the second one to_remove.add(mi2)
merged_indices.add(mi1) merge_parent[mi2] = actual_mi1
merged_indices.add(mi2)
bullet_removed -= 1 # net: merge, not removal bullet_removed -= 1 # net: merge, not removal
if to_remove: if to_remove: