Put escape sequences into separate token

3 years ago · 75037c6514
11 changed files with 162 additions and 31 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+
+## [13.0.0] - WIP
+### Added
+- Added a new token type `text_special` to store escaped characters, same as `text` but
+  unaffected by replacement plugins (smartquotes, typographer, linkifier, etc.).
+- Added a new rule `text_join` in `core` ruler. Text replacement plugins may choose to
+  insert themselves before it.
+
+### Changed
+- `text_collapse` rule is renamed to `fragments_join`.
+
+### Fixed
+- Smartquotes, typographic replacements and plain text links can now be escaped
+  with backslash (e.g. `\(c)` or `google\.com` are no longer replaced).
+
+
 ## [12.3.2] - 2022-01-08
 ### Security
 - Fix possible ReDOS in newline rule. Thanks to @MakeNowJust.
@ -592,6 +608,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Renamed presets folder (configs -> presets).


+[13.0.0]: https://github.com/markdown-it/markdown-it/compare/12.3.2...13.0.0
 [12.3.2]: https://github.com/markdown-it/markdown-it/compare/12.3.1...12.3.2
 [12.3.1]: https://github.com/markdown-it/markdown-it/compare/12.3.0...12.3.1
 [12.3.0]: https://github.com/markdown-it/markdown-it/compare/12.2.0...12.3.0
--- a/lib/parser_core.js
+++ b/lib/parser_core.js
@ -16,7 +16,10 @@ var _rules = [
  [ 'inline',         require('./rules_core/inline')         ],
  [ 'linkify',        require('./rules_core/linkify')        ],
  [ 'replacements',   require('./rules_core/replacements')   ],
-  [ 'smartquotes',    require('./rules_core/smartquotes')    ]
+  [ 'smartquotes',    require('./rules_core/smartquotes')    ],
+  // `text_join` finds `text_special` tokens (for escape sequences)
+  // and joins them with the rest of the text
+  [ 'text_join',      require('./rules_core/text_join')      ]
 ];


--- a/lib/parser_inline.js
+++ b/lib/parser_inline.js
@ -26,11 +26,18 @@ var _rules = [
  [ 'entity',          require('./rules_inline/entity') ]
 ];

+// `rules2` ruleset was created specifically for emphasis/strikethrough
+// post-processing and may be changed in the future.
+//
+// Don't use this for anything except pairs (plugins working with `balance_pairs`).
+//
 var _rules2 = [
  [ 'balance_pairs',   require('./rules_inline/balance_pairs') ],
  [ 'strikethrough',   require('./rules_inline/strikethrough').postProcess ],
  [ 'emphasis',        require('./rules_inline/emphasis').postProcess ],
-  [ 'text_collapse',   require('./rules_inline/text_collapse') ]
+  // rules for pairs split '**' into separate text tokens, which may be left unused;
+  // the rule below merges those unused segments back with the rest of the text
+  [ 'fragments_join',  require('./rules_inline/fragments_join') ]
 ];


--- a/lib/presets/commonmark.js
+++ b/lib/presets/commonmark.js
@ -38,7 +38,8 @@ module.exports = {
      rules: [
        'normalize',
        'block',
-        'inline'
+        'inline',
+        'text_join'
      ]
    },

@ -73,7 +74,7 @@ module.exports = {
      rules2: [
        'balance_pairs',
        'emphasis',
-        'text_collapse'
+        'fragments_join'
      ]
    }
  }
--- a/lib/presets/zero.js
+++ b/lib/presets/zero.js
@ -39,7 +39,8 @@ module.exports = {
      rules: [
        'normalize',
        'block',
-        'inline'
+        'inline',
+        'text_join'
      ]
    },

@ -55,7 +56,7 @@ module.exports = {
      ],
      rules2: [
        'balance_pairs',
-        'text_collapse'
+        'fragments_join'
      ]
    }
  }
--- a/lib/rules_core/text_join.js
+++ b/lib/rules_core/text_join.js
@ -0,0 +1,45 @@
+// Join `text_special` tokens (escape sequences) with the rest of the text
+//
+// This is set as a separate rule to provide an opportunity for plugins
+// to run text replacements before escaped characters are merged into plain text.
+//
+// For example, `\:)` shouldn't be replaced with an emoji.
+//
+'use strict';
+
+
+module.exports = function text_join(state) {
+  var j, l, tokens, curr, max, last,
+      blockTokens = state.tokens;
+
+  for (j = 0, l = blockTokens.length; j < l; j++) {
+    if (blockTokens[j].type !== 'inline') continue;
+
+    tokens = blockTokens[j].children;
+    max = tokens.length;
+
+    for (curr = 0; curr < max; curr++) {
+      if (tokens[curr].type === 'text_special') {
+        tokens[curr].type = 'text';
+      }
+    }
+
+    for (curr = last = 0; curr < max; curr++) {
+      if (tokens[curr].type === 'text' &&
+          curr + 1 < max &&
+          tokens[curr + 1].type === 'text') {
+
+        // collapse two adjacent text nodes
+        tokens[curr + 1].content = tokens[curr].content + tokens[curr + 1].content;
+      } else {
+        if (curr !== last) { tokens[last] = tokens[curr]; }
+
+        last++;
+      }
+    }
+
+    if (curr !== last) {
+      tokens.length = last;
+    }
+  }
+};
--- a/lib/rules_inline/escape.js
+++ b/lib/rules_inline/escape.js
@ -13,40 +13,59 @@ for (var i = 0; i < 256; i++) { ESCAPED.push(0); }


 module.exports = function escape(state, silent) {
-  var ch, pos = state.pos, max = state.posMax;
-
-  if (state.src.charCodeAt(pos) !== 0x5C/* \ */) { return false; }
+  var ch1, ch2, origStr, escapedStr, token, pos = state.pos, max = state.posMax;

+  if (state.src.charCodeAt(pos) !== 0x5C/* \ */) return false;
  pos++;

-  if (pos < max) {
-    ch = state.src.charCodeAt(pos);
+  // '\' at the end of the inline block
+  if (pos >= max) return false;
+
+  ch1 = state.src.charCodeAt(pos);

-    if (ch < 256 && ESCAPED[ch] !== 0) {
-      if (!silent) { state.pending += state.src[pos]; }
-      state.pos += 2;
-      return true;
+  if (ch1 === 0x0A) {
+    if (!silent) {
+      state.push('hardbreak', 'br', 0);
    }

-    if (ch === 0x0A) {
-      if (!silent) {
-        state.push('hardbreak', 'br', 0);
-      }
+    pos++;
+    // skip leading whitespaces from next line
+    while (pos < max) {
+      ch1 = state.src.charCodeAt(pos);
+      if (!isSpace(ch1)) break;
+      pos++;
+    }
+
+    state.pos = pos;
+    return true;
+  }
+
+  escapedStr = state.src[pos];

+  if (ch1 >= 0xD800 && ch1 <= 0xDBFF && pos + 1 < max) {
+    ch2 = state.src.charCodeAt(pos + 1);
+
+    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+      escapedStr += state.src[pos + 1];
      pos++;
-      // skip leading whitespaces from next line
-      while (pos < max) {
-        ch = state.src.charCodeAt(pos);
-        if (!isSpace(ch)) { break; }
-        pos++;
-      }
-
-      state.pos = pos;
-      return true;
    }
  }

-  if (!silent) { state.pending += '\\'; }
-  state.pos++;
+  origStr = '\\' + escapedStr;
+
+  if (!silent) {
+    token = state.push('text_special', '', 0);
+
+    if (ch1 < 256 && ESCAPED[ch1] !== 0) {
+      token.content = escapedStr;
+    } else {
+      token.content = origStr;
+    }
+
+    token.markup = origStr;
+    token.info   = 'escape';
+  }
+
+  state.pos = pos + 1;
  return true;
 };
--- a/lib/rules_inline/fragments_join.js
+++ b/lib/rules_inline/fragments_join.js
@ -9,7 +9,7 @@
 'use strict';


-module.exports = function text_collapse(state) {
+module.exports = function fragments_join(state) {
  var curr, last,
      level = 0,
      tokens = state.tokens,
--- a/test/fixtures/markdown-it/smartquotes.txt
+++ b/test/fixtures/markdown-it/smartquotes.txt
@ -164,3 +164,16 @@ Should parse quotes adjacent to inline html, #677:
 <p>“test <br>”</p>
 <p>“<br> test”</p>
 .
+
+Should be escapable:
+.
+"foo"
+
+\"foo"
+
+"foo\"
+.
+<p>“foo”</p>
+<p>&quot;foo&quot;</p>
+<p>&quot;foo&quot;</p>
+.
--- a/test/fixtures/markdown-it/typographer.txt
+++ b/test/fixtures/markdown-it/typographer.txt
@ -60,6 +60,13 @@ dupes
 <p>!!! ??? ,</p>
 .

+copyright should be escapable
+.
+\(c)
+.
+<p>(c)</p>
+.
+

 dashes
 .
@ -80,6 +87,16 @@ markdownit--awesome
 <p>markdownit–awesome</p>
 .

+dashes should be escapable
+.
+foo \-- bar
+
+foo -\- bar
+.
+<p>foo -- bar</p>
+<p>foo -- bar</p>
+.
+
 regression tests for #624
 .
 1---2---3
--- a/test/misc.js
+++ b/test/misc.js
@ -254,6 +254,14 @@ describe('Misc', function () {
      md.render('# test\n\n - hello\n - world\n')
    );
  });
+
+  it('Should escape surrogate pairs (coverage)', function () {
+    var md = markdownit();
+
+    assert.strictEqual(md.render('\\\uD835\uDC9C'), '<p>\\\uD835\uDC9C</p>\n');
+    assert.strictEqual(md.render('\\\uD835x'), '<p>\\\uD835x</p>\n');
+    assert.strictEqual(md.render('\\\uD835'), '<p>\\\uD835</p>\n');
+  });
 });