Fix page refs deleted as artifacts + IPA spacing for DE mode
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 41s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has started running
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 54s
CI / test-go-edu-search (push) Successful in 41s
CI / test-nodejs-website (push) Has been cancelled
CI / test-python-agent-core (push) Has been cancelled
CI / test-python-klausur (push) Has started running
1. Step 5j-pre wrongly classified page references such as "p.43" and "p.50" as artifacts (mixed digits + letters, <=5 chars). Added an exception for page-reference patterns (p.XX, pXX, S.XX). 2. The IPA spacing regex was too narrow (it only matched brackets containing Unicode IPA chars). It now matches any [bracket] content of >=2 chars directly after a letter, fixing German IPA like "Opa[oːpa]" → "Opa [oːpa]". Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1507,8 +1507,10 @@ async def _build_grid_core(
|
|||||||
is_artifact = True
|
is_artifact = True
|
||||||
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
elif len(core) <= 3 and core.isupper() and core.lower() not in _COMMON_SHORT_WORDS:
|
||||||
is_artifact = True
|
is_artifact = True
|
||||||
elif len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core):
|
elif (len(core) <= 5 and re.search(r'\d', core) and re.search(r'[A-Za-z]', core)
|
||||||
|
and not re.match(r'^[pPsS]\.?\d+$', core)):
|
||||||
# Mixed digits + letters in short text (e.g. "7 EN", "a=3")
|
# Mixed digits + letters in short text (e.g. "7 EN", "a=3")
|
||||||
|
# but NOT page references like "p.43", "p50", "S.12"
|
||||||
is_artifact = True
|
is_artifact = True
|
||||||
if is_artifact:
|
if is_artifact:
|
||||||
kept.append(None) # placeholder
|
kept.append(None) # placeholder
|
||||||
@@ -1717,8 +1719,10 @@ async def _build_grid_core(
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# --- Ensure space before IPA brackets: "word[ipa]" → "word [ipa]" ---
|
# --- Ensure space before IPA/phonetic brackets: "word[ipa]" → "word [ipa]" ---
|
||||||
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]*[ˈˌːɑɒæɛəɜɪɔʊʌðŋθʃʒɹɡɾ][^\]]*\])')
|
# Matches any [bracket] directly after a letter, as long as the bracket
|
||||||
|
# content is at least 2 characters long. NOTE: this also matches plain-text annotations such as "[adj]" or "[noun]".
|
||||||
|
_IPA_NOSPACE_RE = re.compile(r'([a-zA-ZäöüÄÖÜß])(\[[^\]]{2,}\])')
|
||||||
for z in zones_data:
|
for z in zones_data:
|
||||||
for cell in z.get("cells", []):
|
for cell in z.get("cells", []):
|
||||||
text = cell.get("text", "")
|
text = cell.get("text", "")
|
||||||
|
|||||||
Reference in New Issue
Block a user