Preserve = and (= tokens in grid build and cell text cleanup
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m34s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 42s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 43s
CI / test-go-edu-search (push) Successful in 47s
CI / test-python-klausur (push) Failing after 2m34s
CI / test-python-agent-core (push) Successful in 34s
CI / test-nodejs-website (push) Successful in 42s
The "=" sign is used as a definition marker in textbooks (e.g. "film = 1. Film").
Standalone "=" and "(=" tokens were incorrectly removed by two filters:
1. grid_build_core.py, Step 5j-pre: _PURE_JUNK_RE matched "=" and flagged it
as artifact noise. The check now exempts the meaningful punctuation tokens
"=", "(=", "=)", ";", ":", "-", "–" and "—".
2. cv_ocr_engines.py, _is_noise_tail_token: the "pure non-alpha" check
removed trailing "=" tokens. It now exempts the meaningful punctuation tokens
"=", "(=", "=)", ";", ":", "-", "–", "—", "/", "+" and "&".
Fixes: "film = 1. Film; 2. filmen" losing the "=" sign, and
"(= I won and he lost.)" losing the "(=" opener.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1881,6 +1881,11 @@ def _is_noise_tail_token(token: str) -> bool:
|
|||||||
if t.endswith(']'):
|
if t.endswith(']'):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Keep meaningful punctuation tokens used in textbooks
|
||||||
|
# = (definition marker), (= (definition opener), ; (separator)
|
||||||
|
if t in ('=', '(=', '=)', ';', ':', '-', '–', '—', '/', '+', '&'):
|
||||||
|
return False
|
||||||
|
|
||||||
# Pure non-alpha → noise ("3", ")", "|")
|
# Pure non-alpha → noise ("3", ")", "|")
|
||||||
alpha_chars = _RE_ALPHA.findall(t)
|
alpha_chars = _RE_ALPHA.findall(t)
|
||||||
if not alpha_chars:
|
if not alpha_chars:
|
||||||
|
|||||||
@@ -1575,7 +1575,10 @@ async def _build_grid_core(
|
|||||||
if not core:
|
if not core:
|
||||||
is_artifact = True
|
is_artifact = True
|
||||||
elif _PURE_JUNK_RE.match(core):
|
elif _PURE_JUNK_RE.match(core):
|
||||||
is_artifact = True
|
# Keep meaningful punctuation tokens used in textbooks
|
||||||
|
# = (definition), (= (definition), ; : - are valid content
|
||||||
|
if core.strip() not in ('=', '(=', '=)', ';', ':', '-', '–', '—'):
|
||||||
|
is_artifact = True
|
||||||
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
elif len(core) <= 2 and core.lower() not in _COMMON_SHORT_WORDS and not core.isalpha():
|
||||||
# Short non-alphabetic text like "a=", not word beginnings like "Zw"
|
# Short non-alphabetic text like "a=", not word beginnings like "Zw"
|
||||||
is_artifact = True
|
is_artifact = True
|
||||||
|
|||||||
Reference in New Issue
Block a user