fix: merge syllable-split word_boxes + keep dictionary guide words
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s

OCR splits words at syllable marks into overlapping word_boxes (e.g.
"zu" + "tiefst" with 52% x-overlap). Step 5i previously removed the
lower-confidence box, losing the prefix. Now, when both boxes contain
alphabetic text and overlap by more than 20% and at most 75%, they are
MERGED into one word_box ("zutiefst") instead of one being removed.

Also relaxed the artifact cell filter: alphabetic text of up to 2
characters, like "Zw" (a dictionary guide word), is no longer removed
by the short-text rule. Only short text that is not purely alphabetic,
like "a=", is still filtered by it.

Results for session 5997: "tiefst"→"zutiefst", "zu"→"zuständig",
"Zu die Zuschüsse"→"Zuschuss, die Zuschüsse", "Zw" restored.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-22 08:21:00 +01:00
parent 882b177fc3
commit 7b3319be2e

View File

@@ -2431,6 +2431,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
# Rule (b) + (c): overlap and duplicate detection
# Sort by x for pairwise comparison
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge
indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
for p in range(len(indexed) - 1):
i1, w1 = indexed[p]
@@ -2442,19 +2444,33 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
gap = x2s - x1e
overlap_pct = overlap / min_w if min_w > 0 else 0
# (b) Significant x-overlap: remove the lower-confidence one
if overlap_pct > 0.40:
# (b) Significant x-overlap
if overlap_pct > 0.20:
t1 = (w1.get("text") or "").strip()
t2 = (w2.get("text") or "").strip()
# Syllable-split words: both are alphabetic text with
# moderate overlap (20-75%). Merge instead of removing.
# OCR splits words at syllable marks, producing overlapping
# boxes like "zu" + "tiefst" → "zutiefst".
if (overlap_pct <= 0.75
and _ALPHA_WORD_RE.match(t1)
and _ALPHA_WORD_RE.match(t2)):
to_merge.append((i1, i2))
continue
if overlap_pct <= 0.40:
continue # too little overlap and not alphabetic merge
c1 = w1.get("conf", 50)
c2 = w2.get("conf", 50)
t1 = (w1.get("text") or "").strip().lower()
t2 = (w2.get("text") or "").strip().lower()
# For very high overlap (>90%) with different text,
# prefer the word that exists in the IPA dictionary
# over confidence (OCR can give artifacts high conf).
if overlap_pct > 0.90 and t1 != t2:
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False
if overlap_pct > 0.90 and t1.lower() != t2.lower():
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
if in_dict_1 and not in_dict_2:
to_remove.add(i2)
continue
@@ -2483,6 +2499,37 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
c2 = w2.get("conf", 50)
to_remove.add(i1 if c1 <= c2 else i2)
# Execute merges first (syllable-split words)
if to_merge:
merged_indices: set = set()
for mi1, mi2 in to_merge:
if mi1 in to_remove or mi2 in to_remove:
continue # don't merge if one is being removed
if mi1 in merged_indices or mi2 in merged_indices:
continue # already merged
mw1, mw2 = wbs[mi1], wbs[mi2]
# Concatenate text (no space — they're parts of one word)
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
mt2 = (mw2.get("text") or "").strip()
merged_text = mt1 + mt2
# Union bounding box
mx = min(mw1["left"], mw2["left"])
my = min(mw1["top"], mw2["top"])
mr = max(mw1["left"] + mw1["width"],
mw2["left"] + mw2["width"])
mb = max(mw1["top"] + mw1["height"],
mw2["top"] + mw2["height"])
mw1["text"] = merged_text
mw1["left"] = mx
mw1["top"] = my
mw1["width"] = mr - mx
mw1["height"] = mb - my
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
to_remove.add(mi2) # remove the second one
merged_indices.add(mi1)
merged_indices.add(mi2)
bullet_removed -= 1 # net: merge, not removal
if to_remove:
bullet_removed += len(to_remove)
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
@@ -2525,7 +2572,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
is_artifact = True
elif _PURE_JUNK_RE.match(core):
is_artifact = True
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
# Short non-alphabetic text like "a=", not word beginnings like "Zw"
is_artifact = True
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
is_artifact = True