From c4d87f9ad19fea250e89da9f6341864613aa9e1d Mon Sep 17 00:00:00 2001
From: Jason Ojisan
Date: Sun, 13 Oct 2024 16:17:21 +0900
Subject: [PATCH] Update to ZH normalizer regex
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update chinese-reading-normalizer.test.js

3 symbols added: ' ’ -

Signed-off-by: shiki-tm <36088384+shiki-tm@users.noreply.github.com>

* Update chinese.js

3 symbols added: '’-

Signed-off-by: shiki-tm <36088384+shiki-tm@users.noreply.github.com>

* Update regex pattern for chinese.js

Signed-off-by: shiki-tm <36088384+shiki-tm@users.noreply.github.com>

* revert change

* readd separators, escape quote

---------

Signed-off-by: shiki-tm <36088384+shiki-tm@users.noreply.github.com>
Co-authored-by: Cashew ZjI1Nzc1Y2Q2OTQ2ODUwOTE4ODZiZTYzM2EwNTkwOTE3NmI5MWI2NAo=
---
 ext/js/language/zh/chinese.js                    | 2 +-
 test/language/chinese-reading-normalizer.test.js | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js
index c024cad3bd..a824cb7f51 100644
--- a/ext/js/language/zh/chinese.js
+++ b/ext/js/language/zh/chinese.js
@@ -71,5 +71,5 @@ export function isCodePointChinese(codePoint) {
 
 /** @type {import('language').ReadingNormalizer} */
 export function normalizePinyin(str) {
-    return str.normalize('NFC').toLowerCase().replace(/[\s・:]|\/\//g, '');
+    return str.normalize('NFC').toLowerCase().replace(/[\s・:'’-]|\/\//g, '');
 }
diff --git a/test/language/chinese-reading-normalizer.test.js b/test/language/chinese-reading-normalizer.test.js
index 151f58c194..ad6f9a88c4 100644
--- a/test/language/chinese-reading-normalizer.test.js
+++ b/test/language/chinese-reading-normalizer.test.js
@@ -25,6 +25,9 @@ const tests = [
     ['wán:zhěng', 'wánzhěng'],
     ['fān・yì', 'fānyì'],
     ['fān//yì', 'fānyì'],
+    ['fān’yì', 'fānyì'],
+    ['fān\'yì', 'fānyì'],
+    ['fān-yì', 'fānyì'],
 ];
 
 describe('Normalize Pinyin', () => {