fix: merge syllable-split word_boxes + keep dictionary guide words
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 27s
CI / test-go-edu-search (push) Successful in 28s
CI / test-python-klausur (push) Failing after 1m56s
CI / test-python-agent-core (push) Successful in 16s
CI / test-nodejs-website (push) Successful in 17s
OCR splits words at syllable marks into overlapping word_boxes (e.g.
"zu" + "tiefst" with 52% x-overlap). Step 5i previously removed the
lower-confidence box, losing the prefix. Now, when both boxes are
alphabetic text with 20-75% x-overlap, they are MERGED into a single
word_box ("zutiefst") instead of one of them being removed.
Also relaxed the artifact cell filter: 2-char alphabetic text such as
"Zw" (a dictionary guide word) is no longer removed. Only short
non-alphabetic text such as "a=" is still filtered out.
Results for session 5997: "tiefst"→"zutiefst", "zu"→"zuständig",
"Zu die Zuschüsse"→"Zuschuss, die Zuschüsse", "Zw" restored.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2431,6 +2431,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
|
||||
# Rule (b) + (c): overlap and duplicate detection
|
||||
# Sort by x for pairwise comparison
|
||||
_ALPHA_WORD_RE = re.compile(r'^[A-Za-z\u00c0-\u024f\-]+[.,;:!?]*$')
|
||||
to_merge: List[Tuple[int, int]] = [] # pairs (i1, i2) to merge
|
||||
indexed = sorted(enumerate(wbs), key=lambda iw: iw[1].get("left", 0))
|
||||
for p in range(len(indexed) - 1):
|
||||
i1, w1 = indexed[p]
|
||||
@@ -2442,19 +2444,33 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
gap = x2s - x1e
|
||||
overlap_pct = overlap / min_w if min_w > 0 else 0
|
||||
|
||||
# (b) Significant x-overlap: remove the lower-confidence one
|
||||
if overlap_pct > 0.40:
|
||||
# (b) Significant x-overlap
|
||||
if overlap_pct > 0.20:
|
||||
t1 = (w1.get("text") or "").strip()
|
||||
t2 = (w2.get("text") or "").strip()
|
||||
|
||||
# Syllable-split words: both are alphabetic text with
|
||||
# moderate overlap (20-75%). Merge instead of removing.
|
||||
# OCR splits words at syllable marks, producing overlapping
|
||||
# boxes like "zu" + "tiefst" → "zutiefst".
|
||||
if (overlap_pct <= 0.75
|
||||
and _ALPHA_WORD_RE.match(t1)
|
||||
and _ALPHA_WORD_RE.match(t2)):
|
||||
to_merge.append((i1, i2))
|
||||
continue
|
||||
|
||||
if overlap_pct <= 0.40:
|
||||
continue # too little overlap and not alphabetic merge
|
||||
|
||||
c1 = w1.get("conf", 50)
|
||||
c2 = w2.get("conf", 50)
|
||||
t1 = (w1.get("text") or "").strip().lower()
|
||||
t2 = (w2.get("text") or "").strip().lower()
|
||||
|
||||
# For very high overlap (>90%) with different text,
|
||||
# prefer the word that exists in the IPA dictionary
|
||||
# over confidence (OCR can give artifacts high conf).
|
||||
if overlap_pct > 0.90 and t1 != t2:
|
||||
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1), "british")) if t1.isalpha() else False
|
||||
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2), "british")) if t2.isalpha() else False
|
||||
if overlap_pct > 0.90 and t1.lower() != t2.lower():
|
||||
in_dict_1 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t1.lower()), "british")) if t1.isalpha() else False
|
||||
in_dict_2 = bool(_lookup_ipa(re.sub(r'[²³¹\d/]', '', t2.lower()), "british")) if t2.isalpha() else False
|
||||
if in_dict_1 and not in_dict_2:
|
||||
to_remove.add(i2)
|
||||
continue
|
||||
@@ -2483,6 +2499,37 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
c2 = w2.get("conf", 50)
|
||||
to_remove.add(i1 if c1 <= c2 else i2)
|
||||
|
||||
# Execute merges first (syllable-split words)
|
||||
if to_merge:
|
||||
merged_indices: set = set()
|
||||
for mi1, mi2 in to_merge:
|
||||
if mi1 in to_remove or mi2 in to_remove:
|
||||
continue # don't merge if one is being removed
|
||||
if mi1 in merged_indices or mi2 in merged_indices:
|
||||
continue # already merged
|
||||
mw1, mw2 = wbs[mi1], wbs[mi2]
|
||||
# Concatenate text (no space — they're parts of one word)
|
||||
mt1 = (mw1.get("text") or "").rstrip(".,;:!?")
|
||||
mt2 = (mw2.get("text") or "").strip()
|
||||
merged_text = mt1 + mt2
|
||||
# Union bounding box
|
||||
mx = min(mw1["left"], mw2["left"])
|
||||
my = min(mw1["top"], mw2["top"])
|
||||
mr = max(mw1["left"] + mw1["width"],
|
||||
mw2["left"] + mw2["width"])
|
||||
mb = max(mw1["top"] + mw1["height"],
|
||||
mw2["top"] + mw2["height"])
|
||||
mw1["text"] = merged_text
|
||||
mw1["left"] = mx
|
||||
mw1["top"] = my
|
||||
mw1["width"] = mr - mx
|
||||
mw1["height"] = mb - my
|
||||
mw1["conf"] = (mw1.get("conf", 50) + mw2.get("conf", 50)) / 2
|
||||
to_remove.add(mi2) # remove the second one
|
||||
merged_indices.add(mi1)
|
||||
merged_indices.add(mi2)
|
||||
bullet_removed -= 1 # net: merge, not removal
|
||||
|
||||
if to_remove:
|
||||
bullet_removed += len(to_remove)
|
||||
filtered = [wb for i, wb in enumerate(wbs) if i not in to_remove]
|
||||
@@ -2525,7 +2572,8 @@ async def _build_grid_core(session_id: str, session: dict) -> dict:
|
||||
is_artifact = True
|
||||
elif _PURE_JUNK_RE.match(core):
|
||||
is_artifact = True
|
||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS:
|
||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
||||
# Short non-alphabetic text like "a=", not word beginnings like "Zw"
|
||||
is_artifact = True
|
||||
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||
is_artifact = True
|
||||
|
||||
Reference in New Issue
Block a user