From 5e78c00eee93715d4b93dd6584ee122fc76880fd Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Sun, 15 Dec 2024 16:25:15 +0900 Subject: [PATCH 1/4] Recognize non-BMP punctuations & symbols --- lib/rules_inline/state_inline.mjs | 22 ++++++++++++++----- .../markdown-it/commonmark_extras.txt | 22 +++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs index 80cb3c5..ddce556 100644 --- a/lib/rules_inline/state_inline.mjs +++ b/lib/rules_inline/state_inline.mjs @@ -89,8 +89,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { const max = this.posMax const marker = this.src.charCodeAt(start) - // treat beginning of the line as a whitespace - const lastChar = start > 0 ? this.src.charCodeAt(start - 1) : 0x20 + const lastChar = getLastCharCode(this.src, start) let pos = start while (pos < max && this.src.charCodeAt(pos) === marker) { pos++ } @@ -98,10 +97,10 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { const count = pos - start // treat end of the line as a whitespace - const nextChar = pos < max ? this.src.charCodeAt(pos) : 0x20 + const nextChar = pos < max ? 
this.src.codePointAt(pos) : 0x20 - const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar)) - const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar)) + const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCodePoint(lastChar)) + const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCodePoint(nextChar)) const isLastWhiteSpace = isWhiteSpace(lastChar) const isNextWhiteSpace = isWhiteSpace(nextChar) @@ -115,6 +114,19 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { const can_close = right_flanking && (canSplitWord || !left_flanking || isNextPunctChar) return { can_open, can_close, length: count } + + function getLastCharCode (str, pos) { + // treat beginning of the line as a whitespace + if (pos <= 0) { return 0x20 } + const charCode = str.codePointAt(pos - 1) + // not low surrogates (BMP) + if ((charCode & 0xFC00) !== 0xDC00) { return charCode } + + // undefined if out of range (leading stray low surrogates) + const codePoint = str.codePointAt(pos - 2) + // undefined > 0xffff = false, so we don't need extra check here + return codePoint > 0xffff ? codePoint : charCode + } } // re-export Token class to use in block rules diff --git a/test/fixtures/markdown-it/commonmark_extras.txt b/test/fixtures/markdown-it/commonmark_extras.txt index 558c011..5e24726 100644 --- a/test/fixtures/markdown-it/commonmark_extras.txt +++ b/test/fixtures/markdown-it/commonmark_extras.txt @@ -740,3 +740,25 @@ Html in image description .

text <textarea> text

. + +Issue #1071. Recognize non-BMP punctuations and symbols +. +a*a∇*a + +a*∇a*a + +a*a𝜵*a + +a*𝜵a*a + +a*𐬼a*a + +a*a𐬼*a +. +

<p>a*a∇*a</p>
+<p>a*∇a*a</p>
+<p>a*a𝜵*a</p>
+<p>a*𝜵a*a</p>
+<p>a*𐬼a*a</p>
+<p>a*a𐬼*a</p>
+. From 7a6d58af5f3d153a5310e4b785b6e27ca7dbe354 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 20 Dec 2024 00:02:32 +0900 Subject: [PATCH 2/4] Add comment --- lib/rules_inline/state_inline.mjs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs index ddce556..225837b 100644 --- a/lib/rules_inline/state_inline.mjs +++ b/lib/rules_inline/state_inline.mjs @@ -120,6 +120,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { if (pos <= 0) { return 0x20 } const charCode = str.codePointAt(pos - 1) // not low surrogates (BMP) + // undefined & 0xFC00 = 0 if ((charCode & 0xFC00) !== 0xDC00) { return charCode } // undefined if out of range (leading stray low surrogates) From a091ed9c9b17a26319c0afeb86c6b80b18a45ed4 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 20 Dec 2024 00:05:50 +0900 Subject: [PATCH 3/4] Fix comment --- lib/rules_inline/state_inline.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs index 225837b..d1c00a6 100644 --- a/lib/rules_inline/state_inline.mjs +++ b/lib/rules_inline/state_inline.mjs @@ -120,7 +120,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { if (pos <= 0) { return 0x20 } const charCode = str.codePointAt(pos - 1) // not low surrogates (BMP) - // undefined & 0xFC00 = 0 + // undefined & 0xFC00 = 0, but never happens thanks to the first if if ((charCode & 0xFC00) !== 0xDC00) { return charCode } // undefined if out of range (leading stray low surrogates) From 1321d2eaa066fb4277c2e12c9b9b2de3637e2492 Mon Sep 17 00:00:00 2001 From: Tatsunori Uchino Date: Fri, 20 Dec 2024 00:09:39 +0900 Subject: [PATCH 4/4] codePointAt is excessive --- lib/rules_inline/state_inline.mjs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs index d1c00a6..8d33819 
100644 --- a/lib/rules_inline/state_inline.mjs +++ b/lib/rules_inline/state_inline.mjs @@ -118,9 +118,8 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) { function getLastCharCode (str, pos) { // treat beginning of the line as a whitespace if (pos <= 0) { return 0x20 } - const charCode = str.codePointAt(pos - 1) + const charCode = str.charCodeAt(pos - 1) // not low surrogates (BMP) - // undefined & 0xFC00 = 0, but never happens thanks to the first if if ((charCode & 0xFC00) !== 0xDC00) { return charCode } // undefined if out of range (leading stray low surrogates)