Update CommonMark spec to 0.28

8 years ago · 2959f8c27c
6 changed files with 785 additions and 702 deletions
--- a/lib/common/html_blocks.js
+++ b/lib/common/html_blocks.js
@ -55,10 +55,8 @@ module.exports = [
  'option',
  'p',
  'param',
-  'pre',
  'section',
  'source',
-  'title',
  'summary',
  'table',
  'tbody',
--- a/lib/helpers/parse_link_destination.js
+++ b/lib/helpers/parse_link_destination.js
@ -59,18 +59,18 @@ module.exports = function parseLinkDestination(str, pos, max) {

    if (code === 0x28 /* ( */) {
      level++;
-      if (level > 1) { break; }
    }

    if (code === 0x29 /* ) */) {
+      if (level === 0) { break; }
      level--;
-      if (level < 0) { break; }
    }

    pos++;
  }

  if (start === pos) { return result; }
+  if (level !== 0) { return result; }

  result.str = unescapeAll(str.slice(start, pos));
  result.lines = lines;
--- a/lib/rules_inline/emphasis.js
+++ b/lib/rules_inline/emphasis.js
@ -77,7 +77,7 @@ module.exports.postProcess = function emphasis(state) {
      delimiters = state.delimiters,
      max = state.delimiters.length;

-  for (i = 0; i < max; i++) {
+  for (i = max - 1; i >= 0; i--) {
    startDelim = delimiters[i];

    if (startDelim.marker !== 0x5F/* _ */ && startDelim.marker !== 0x2A/* * */) {
@ -91,16 +91,16 @@ module.exports.postProcess = function emphasis(state) {

    endDelim = delimiters[startDelim.end];

-    // If the next delimiter has the same marker and is adjacent to this one,
+    // If the previous delimiter has the same marker and is adjacent to this one,
    // merge those into one strong delimiter.
    //
    // `<em><em>whatever</em></em>` -> `<strong>whatever</strong>`
    //
-    isStrong = i + 1 < max &&
-               delimiters[i + 1].end === startDelim.end - 1 &&
-               delimiters[i + 1].token === startDelim.token + 1 &&
-               delimiters[startDelim.end - 1].token === endDelim.token - 1 &&
-               delimiters[i + 1].marker === startDelim.marker;
+    isStrong = i > 0 &&
+               delimiters[i - 1].end === startDelim.end + 1 &&
+               delimiters[i - 1].token === startDelim.token - 1 &&
+               delimiters[startDelim.end + 1].token === endDelim.token + 1 &&
+               delimiters[i - 1].marker === startDelim.marker;

    ch = String.fromCharCode(startDelim.marker);

@ -119,9 +119,9 @@ module.exports.postProcess = function emphasis(state) {
    token.content = '';

    if (isStrong) {
-      state.tokens[delimiters[i + 1].token].content = '';
-      state.tokens[delimiters[startDelim.end - 1].token].content = '';
-      i++;
+      state.tokens[delimiters[i - 1].token].content = '';
+      state.tokens[delimiters[startDelim.end + 1].token].content = '';
+      i--;
    }
  }
 };
--- a/test/fixtures/commonmark/good.txt
+++ b/test/fixtures/commonmark/good.txt
--- a/test/fixtures/commonmark/spec.txt
+++ b/test/fixtures/commonmark/spec.txt
@ -1,8 +1,8 @@
 ---
 title: CommonMark Spec
 author: John MacFarlane
-version: 0.27
-date: '2016-11-18'
+version: 0.28
+date: '2017-08-01'
 license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)'
 ...

@ -11,10 +11,12 @@ license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)'
 ## What is Markdown?

 Markdown is a plain text format for writing structured documents,
-based on conventions used for indicating formatting in email and
-usenet posts.  It was developed in 2004 by John Gruber, who wrote
-the first Markdown-to-HTML converter in Perl, and it soon became
-ubiquitous.  In the next decade, dozens of implementations were
+based on conventions for indicating formatting in email
+and usenet posts.  It was developed by John Gruber (with
+help from Aaron Swartz) and released in 2004 in the form of a
+[syntax description](http://daringfireball.net/projects/markdown/syntax)
+and a Perl script (`Markdown.pl`) for converting Markdown to
+HTML.  In the next decade, dozens of implementations were
 developed in many languages.  Some extended the original
 Markdown syntax with conventions for footnotes, tables, and
 other document elements.  Some allowed Markdown documents to be
@ -312,7 +314,7 @@ form feed (`U+000C`), or carriage return (`U+000D`).
 characters].

 A [Unicode whitespace character](@) is
-any code point in the Unicode `Zs` class, or a tab (`U+0009`),
+any code point in the Unicode `Zs` general category, or a tab (`U+0009`),
 carriage return (`U+000D`), newline (`U+000A`), or form feed
 (`U+000C`).

@ -331,7 +333,7 @@ is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,

 A [punctuation character](@) is an [ASCII
 punctuation character] or anything in
-the Unicode classes `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`.
+the Unicode general categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`.

 ## Tabs

@ -402,8 +404,8 @@ as indentation with four spaces would:
 Normally the `>` that begins a block quote may be followed
 optionally by a space, which is not considered part of the
 content.  In the following case `>` is followed by a tab,
-which is treated as if it were expanded into spaces.
-Since one of theses spaces is considered part of the
+which is treated as if it were expanded into three spaces.
+Since one of these spaces is considered part of the
 delimiter, `foo` is considered to be indented six spaces
 inside the block quote context, so we get an indented
 code block starting with two spaces.
@ -481,7 +483,7 @@ We can think of a document as a sequence of
 quotations, lists, headings, rules, and code blocks.  Some blocks (like
 block quotes and list items) contain other blocks; others (like
 headings and paragraphs) contain [inline](@) content---text,
-links, emphasized text, images, code, and so on.
+links, emphasized text, images, code spans, and so on.

 ## Precedence

@ -1643,6 +1645,15 @@ With tildes:
 </code></pre>
 ````````````````````````````````

+Fewer than three backticks is not enough:
+
+```````````````````````````````` example
+``
+foo
+``
+.
+<p><code>foo</code></p>
+````````````````````````````````

 The closing code fence must use the same character as the opening
 fence:
@ -2031,6 +2042,37 @@ or [closing tag] (with any [tag name] other than `script`,
 or the end of the line.\
 **End condition:** line is followed by a [blank line].

+HTML blocks continue until they are closed by their appropriate
+[end condition], or the last line of the document or other [container block].
+This means any HTML **within an HTML block** that might otherwise be recognised
+as a start condition will be ignored by the parser and passed through as-is,
+without changing the parser's state.
+
+For instance, `<pre>` within an HTML block started by `<table>` will not affect
+the parser state; as the HTML block was started by start condition 6, it
+will end at any blank line. This can be surprising:
+
+```````````````````````````````` example
+<table><tr><td>
+<pre>
+**Hello**,
+
+_world_.
+</pre>
+</td></tr></table>
+.
+<table><tr><td>
+<pre>
+**Hello**,
+<p><em>world</em>.
+</pre></p>
+</td></tr></table>
+````````````````````````````````
+
+In this case, the HTML block is terminated by the blank line — the `**Hello**`
+text remains verbatim — and regular parsing resumes, with a paragraph,
+emphasised `world` and inline and block HTML following.
+
 All types of [HTML blocks] except type 7 may interrupt
 a paragraph.  Blocks of type 7 may not interrupt a paragraph.
 (This restriction is intended to prevent unwanted interpretation
@ -3637,11 +3679,15 @@ The following rules define [list items]:
    If the list item is ordered, then it is also assigned a start
    number, based on the ordered list marker.

-    Exceptions: When the first list item in a [list] interrupts
-    a paragraph---that is, when it starts on a line that would
-    otherwise count as [paragraph continuation text]---then (a)
-    the lines *Ls* must not begin with a blank line, and (b) if
-    the list item is ordered, the start number must be 1.
+    Exceptions:
+
+    1. When the first list item in a [list] interrupts
+       a paragraph---that is, when it starts on a line that would
+       otherwise count as [paragraph continuation text]---then (a)
+       the lines *Ls* must not begin with a blank line, and (b) if
+       the list item is ordered, the start number must be 1.
+    2. If any line is a [thematic break][thematic breaks] then
+       that line is not a list item.

 For example, let *Ls* be the lines

@ -5796,6 +5842,15 @@ we just have literal backticks:
 <p>`foo</p>
 ````````````````````````````````

+The following case also illustrates the need for opening and
+closing backtick strings to be equal in length:
+
+```````````````````````````````` example
+`foo``bar``
+.
+<p>`foo<code>bar</code></p>
+````````````````````````````````
+

 ## Emphasis and strong emphasis

@ -5845,19 +5900,20 @@ for efficient parsing strategies that do not backtrack.

 First, some definitions.  A [delimiter run](@) is either
 a sequence of one or more `*` characters that is not preceded or
-followed by a `*` character, or a sequence of one or more `_`
-characters that is not preceded or followed by a `_` character.
+followed by a non-backslash-escaped `*` character, or a sequence
+of one or more `_` characters that is not preceded or followed by
+a non-backslash-escaped `_` character.

 A [left-flanking delimiter run](@) is
 a [delimiter run] that is (a) not followed by [Unicode whitespace],
-and (b) either not followed by a [punctuation character], or
+and (b) not followed by a [punctuation character], or
 preceded by [Unicode whitespace] or a [punctuation character].
 For purposes of this definition, the beginning and the end of
 the line count as Unicode whitespace.

 A [right-flanking delimiter run](@) is
 a [delimiter run] that is (a) not preceded by [Unicode whitespace],
-and (b) either not preceded by a [punctuation character], or
+and (b) not preceded by a [punctuation character], or
 followed by [Unicode whitespace] or a [punctuation character].
 For purposes of this definition, the beginning and the end of
 the line count as Unicode whitespace.
@ -5936,7 +5992,7 @@ The following rules define emphasis and strong emphasis:
 7.  A double `**` [can close strong emphasis](@)
    iff it is part of a [right-flanking delimiter run].

-8.  A double `__` [can close strong emphasis]
+8.  A double `__` [can close strong emphasis] iff
    it is part of a [right-flanking delimiter run]
    and either (a) not part of a [left-flanking delimiter run]
    or (b) part of a [left-flanking delimiter run]
@ -5976,8 +6032,8 @@ the following principles resolve ambiguity:
    an interpretation `<strong>...</strong>` is always preferred to
    `<em><em>...</em></em>`.

-14. An interpretation `<strong><em>...</em></strong>` is always
-    preferred to `<em><strong>..</strong></em>`.
+14. An interpretation `<em><strong>...</strong></em>` is always
+    preferred to `<strong><em>...</em></strong>`.

 15. When two potential emphasis or strong emphasis spans overlap,
    so that the second begins before the first ends and ends after
@ -7000,14 +7056,14 @@ Rule 14:
 ```````````````````````````````` example
 ***foo***
 .
-<p><strong><em>foo</em></strong></p>
+<p><em><strong>foo</strong></em></p>
 ````````````````````````````````


 ```````````````````````````````` example
 _____foo_____
 .
-<p><strong><strong><em>foo</em></strong></strong></p>
+<p><em><strong><strong>foo</strong></strong></em></p>
 ````````````````````````````````


@ -7148,8 +7204,9 @@ A [link destination](@) consists of either
 - a nonempty sequence of characters that does not include
  ASCII space or control characters, and includes parentheses
  only if (a) they are backslash-escaped or (b) they are part of
-  a balanced pair of unescaped parentheses that is not itself
-  inside a balanced pair of unescaped parentheses.
+  a balanced pair of unescaped parentheses.  (Implementations
+  may impose limits on parentheses nesting to avoid performance
+  issues, but at least three levels of nesting should be supported.)

 A [link title](@)  consists of either

@ -7255,35 +7312,29 @@ Parentheses inside the link destination may be escaped:
 <p><a href="(foo)">link</a></p>
 ````````````````````````````````

-One level of balanced parentheses is allowed without escaping:
-
-```````````````````````````````` example
-[link]((foo)and(bar))
-.
-<p><a href="(foo)and(bar)">link</a></p>
-````````````````````````````````
-
-However, if you have parentheses within parentheses, you need to escape
-or use the `<...>` form:
+Any number of parentheses are allowed without escaping, as long as they are
+balanced:

 ```````````````````````````````` example
 [link](foo(and(bar)))
 .
-<p>[link](foo(and(bar)))</p>
+<p><a href="foo(and(bar))">link</a></p>
 ````````````````````````````````

+However, if you have unbalanced parentheses, you need to escape or use the
+`<...>` form:

 ```````````````````````````````` example
-[link](foo(and\(bar\)))
+[link](foo\(and\(bar\))
 .
-<p><a href="foo(and(bar))">link</a></p>
+<p><a href="foo(and(bar)">link</a></p>
 ````````````````````````````````


 ```````````````````````````````` example
-[link](<foo(and(bar))>)
+[link](<foo(and(bar)>)
 .
-<p><a href="foo(and(bar))">link</a></p>
+<p><a href="foo(and(bar)">link</a></p>
 ````````````````````````````````


@ -7567,13 +7618,16 @@ that [matches] a [link reference definition] elsewhere in the document.
 A [link label](@)  begins with a left bracket (`[`) and ends
 with the first right bracket (`]`) that is not backslash-escaped.
 Between these brackets there must be at least one [non-whitespace character].
-Unescaped square bracket characters are not allowed in
-[link labels].  A link label can have at most 999
-characters inside the square brackets.
+Unescaped square bracket characters are not allowed inside the
+opening and closing square brackets of [link labels].  A link
+label can have at most 999 characters inside the square
+brackets.

 One label [matches](@)
 another just in case their normalized forms are equal.  To normalize a
-label, perform the *Unicode case fold* and collapse consecutive internal
+label, strip off the opening and closing brackets,
+perform the *Unicode case fold*, strip leading and trailing
+[whitespace] and collapse consecutive internal
 [whitespace] to a single space.  If there are multiple
 matching reference link definitions, the one that comes first in the
 document is used.  (It is desirable in such cases to emit a warning.)
@ -8326,11 +8380,11 @@ The link labels are case-insensitive:
 ````````````````````````````````


-If you just want bracketed text, you can backslash-escape the
-opening `!` and `[`:
+If you just want a literal `!` followed by bracketed text, you can
+backslash-escape the opening `[`:

 ```````````````````````````````` example
-\!\[foo]
+!\[foo]

 [foo]: /url "title"
 .
--- a/test/misc.js
+++ b/test/misc.js
@ -203,8 +203,8 @@ describe('Misc', function () {

    md.enable('emphasis');

-    assert.strictEqual(md.render('___foo___'), '<p><strong><em>foo</em></strong></p>\n');
-    assert.strictEqual(md.renderInline('___foo___'), '<strong><em>foo</em></strong>');
+    assert.strictEqual(md.render('___foo___'), '<p><em><strong>foo</strong></em></p>\n');
+    assert.strictEqual(md.renderInline('___foo___'), '<em><strong>foo</strong></em>');
  });

  it('Should correctly check block termination rules when those are disabled (#13)', function () {