From 5e78c00eee93715d4b93dd6584ee122fc76880fd Mon Sep 17 00:00:00 2001
From: Tatsunori Uchino <tats.u@live.jp>
Date: Sun, 15 Dec 2024 16:25:15 +0900
Subject: [PATCH 1/4] Recognize non-BMP punctuation & symbols

---
 lib/rules_inline/state_inline.mjs             | 22 ++++++++++++++-----
 .../markdown-it/commonmark_extras.txt         | 22 +++++++++++++++++++
 2 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs
index 80cb3c5..ddce556 100644
--- a/lib/rules_inline/state_inline.mjs
+++ b/lib/rules_inline/state_inline.mjs
@@ -89,8 +89,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
   const max = this.posMax
   const marker = this.src.charCodeAt(start)
 
-  // treat beginning of the line as a whitespace
-  const lastChar = start > 0 ? this.src.charCodeAt(start - 1) : 0x20
+  const lastChar = getLastCharCode(this.src, start)
 
   let pos = start
   while (pos < max && this.src.charCodeAt(pos) === marker) { pos++ }
@@ -98,10 +97,10 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
   const count = pos - start
 
   // treat end of the line as a whitespace
-  const nextChar = pos < max ? this.src.charCodeAt(pos) : 0x20
+  const nextChar = pos < max ? this.src.codePointAt(pos) : 0x20
 
-  const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCharCode(lastChar))
-  const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCharCode(nextChar))
+  const isLastPunctChar = isMdAsciiPunct(lastChar) || isPunctChar(String.fromCodePoint(lastChar))
+  const isNextPunctChar = isMdAsciiPunct(nextChar) || isPunctChar(String.fromCodePoint(nextChar))
 
   const isLastWhiteSpace = isWhiteSpace(lastChar)
   const isNextWhiteSpace = isWhiteSpace(nextChar)
@@ -115,6 +114,19 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
   const can_close = right_flanking && (canSplitWord || !left_flanking  || isNextPunctChar)
 
   return { can_open, can_close, length: count }
+
+  function getLastCharCode (str, pos) {
+    // treat beginning of the line as a whitespace
+    if (pos <= 0) { return 0x20 }
+    const charCode = str.codePointAt(pos - 1)
+    // not low surrogates (BMP)
+    if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
+
+    // undefined if out of range (leading stray low surrogates)
+    const codePoint = str.codePointAt(pos - 2)
+    // undefined > 0xffff = false, so we don't need extra check here
+    return codePoint > 0xffff ? codePoint : charCode
+  }
 }
 
 // re-export Token class to use in block rules
diff --git a/test/fixtures/markdown-it/commonmark_extras.txt b/test/fixtures/markdown-it/commonmark_extras.txt
index 558c011..5e24726 100644
--- a/test/fixtures/markdown-it/commonmark_extras.txt
+++ b/test/fixtures/markdown-it/commonmark_extras.txt
@@ -740,3 +740,25 @@ Html in image description
 .
 <p><img src="image.png" alt="text &lt;textarea&gt; text"></p>
 .
+
+Issue #1071. Recognize non-BMP punctuations and symbols
+.
+a*a∇*a
+
+a*∇a*a
+
+a*a𝜵*a
+
+a*𝜵a*a
+
+a*𐬼a*a
+
+a*a𐬼*a
+.
+<p>a*a∇*a</p>
+<p>a*∇a*a</p>
+<p>a*a𝜵*a</p>
+<p>a*𝜵a*a</p>
+<p>a*𐬼a*a</p>
+<p>a*a𐬼*a</p>
+.

From 7a6d58af5f3d153a5310e4b785b6e27ca7dbe354 Mon Sep 17 00:00:00 2001
From: Tatsunori Uchino <tats.u@live.jp>
Date: Fri, 20 Dec 2024 00:02:32 +0900
Subject: [PATCH 2/4] Add comment

---
 lib/rules_inline/state_inline.mjs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs
index ddce556..225837b 100644
--- a/lib/rules_inline/state_inline.mjs
+++ b/lib/rules_inline/state_inline.mjs
@@ -120,6 +120,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
     if (pos <= 0) { return 0x20 }
     const charCode = str.codePointAt(pos - 1)
     // not low surrogates (BMP)
+    // undefined & 0xFC00 = 0
     if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
 
     // undefined if out of range (leading stray low surrogates)

From a091ed9c9b17a26319c0afeb86c6b80b18a45ed4 Mon Sep 17 00:00:00 2001
From: Tatsunori Uchino <tats.u@live.jp>
Date: Fri, 20 Dec 2024 00:05:50 +0900
Subject: [PATCH 3/4] Fix comment

---
 lib/rules_inline/state_inline.mjs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs
index 225837b..d1c00a6 100644
--- a/lib/rules_inline/state_inline.mjs
+++ b/lib/rules_inline/state_inline.mjs
@@ -120,7 +120,7 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
     if (pos <= 0) { return 0x20 }
     const charCode = str.codePointAt(pos - 1)
     // not low surrogates (BMP)
-    // undefined & 0xFC00 = 0
+    // undefined & 0xFC00 = 0, but never happens thanks to the first if
     if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
 
     // undefined if out of range (leading stray low surrogates)

From 1321d2eaa066fb4277c2e12c9b9b2de3637e2492 Mon Sep 17 00:00:00 2001
From: Tatsunori Uchino <tats.u@live.jp>
Date: Fri, 20 Dec 2024 00:09:39 +0900
Subject: [PATCH 4/4] Use charCodeAt for the preceding code unit (codePointAt is excessive)

---
 lib/rules_inline/state_inline.mjs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/rules_inline/state_inline.mjs b/lib/rules_inline/state_inline.mjs
index d1c00a6..8d33819 100644
--- a/lib/rules_inline/state_inline.mjs
+++ b/lib/rules_inline/state_inline.mjs
@@ -118,9 +118,8 @@ StateInline.prototype.scanDelims = function (start, canSplitWord) {
   function getLastCharCode (str, pos) {
     // treat beginning of the line as a whitespace
     if (pos <= 0) { return 0x20 }
-    const charCode = str.codePointAt(pos - 1)
+    const charCode = str.charCodeAt(pos - 1)
     // not low surrogates (BMP)
-    // undefined & 0xFC00 = 0, but never happens thanks to the first if
     if ((charCode & 0xFC00) !== 0xDC00) { return charCode }
 
     // undefined if out of range (leading stray low surrogates)