Add abbreviations

10 years ago · 2c286f5aed
5 changed files with 246 additions and 0 deletions
--- a/lib/parser_core.js
+++ b/lib/parser_core.js
@ -8,8 +8,10 @@ var Ruler  = require('./ruler');
 var _rules = [
  [ 'block',        require('./rules_core/block')        ],
  [ 'abbr',         require('./rules_core/abbr')         ],
  [ 'references',   require('./rules_core/references')   ],
  [ 'inline',       require('./rules_core/inline')       ],
  [ 'abbr2',        require('./rules_core/abbr2')        ],
  [ 'replacements', require('./rules_core/replacements') ],
  [ 'smartquotes',  require('./rules_core/smartquotes')  ],
  [ 'linkify',      require('./rules_core/linkify')      ]
--- a/lib/renderer.js
+++ b/lib/renderer.js
@ -71,6 +71,15 @@ function getBreak(tokens, idx) {
 // Table of render functions, keyed by token type.
 var rules = {};
 // Opening tag of an abbreviation. A `title` attribute is emitted only when the
 // token carries a non-empty title; the title is passed through replaceEntities
 // and then escapeHtml before being placed inside the attribute.
 rules.abbr_open = function (tokens, idx/*, options*/) {
  var rawTitle = tokens[idx].title;
  if (!rawTitle) {
    return '<abbr>';
  }
  return '<abbr title="' + escapeHtml(replaceEntities(rawTitle)) + '">';
 };
 // Closing tag of an abbreviation; takes no input into account.
 rules.abbr_close = function (/*tokens, idx, options*/) {
  var closingTag = '</abbr>';
  return closingTag;
 };
 // Opening tag of a block quote, followed by a line break.
 rules.blockquote_open = function (/*tokens, idx, options*/) {
  var openingTag = '<blockquote>\n';
  return openingTag;
 };
--- a/lib/rules_core/abbr.js
+++ b/lib/rules_core/abbr.js
@ -0,0 +1,62 @@
 // Parse abbreviation definitions, i.e. `*[abbr]: description`
 //
 'use strict';
 var StateInline          = require('../rules_inline/state_inline');
 var parseLinkLabel       = require('../links').parseLinkLabel;
 // Try to read one abbreviation definition (`*[label]: title`) from the very
 // start of `str`.
 //
 // - str     - text to examine; the definition must begin at index 0
 // - parser  - inline parser, handed to StateInline
 // - options - parser options, handed to StateInline
 // - env     - environment object; definitions are collected in `env.abbreviations`
 //
 // Returns the index just past the definition's title (the position of the
 // terminating "\n", or the end of the input), or -1 when `str` does not start
 // with a valid definition. The first definition of a label wins; later
 // definitions of the same label are consumed but do not change the title.
 function parseAbbr(str, parser, options, env) {
  var state, labelEnd, pos, max, label, title;
  if (str.charCodeAt(0) !== 0x2A/* * */) { return -1; }
  if (str.charCodeAt(1) !== 0x5B/* [ */) { return -1; }
  // cheap pre-check before building a parser state
  if (str.indexOf(']:') === -1) { return -1; }
  state = new StateInline(str, parser, options, env);
  labelEnd = parseLinkLabel(state, 1);
  if (labelEnd < 0 || str.charCodeAt(labelEnd + 1) !== 0x3A/* : */) { return -1; }
  max = state.posMax;
  // abbr title is always one line, so looking for ending "\n" here
  for (pos = labelEnd + 2; pos < max; pos++) {
    if (state.src.charCodeAt(pos) === 0x0A) { break; }
  }
  label = str.slice(2, labelEnd);
  title = str.slice(labelEnd + 2, pos).trim();
  // An empty label would add an empty alternative to the matcher built from
  // the collected labels, so it is rejected just like an empty title.
  if (label.length === 0 || title.length === 0) { return -1; }
  // Null-prototype dictionary: labels such as `__proto__` become ordinary keys.
  if (!env.abbreviations) { env.abbreviations = Object.create(null); }
  // Own-property check, so labels that collide with Object.prototype members
  // (`toString`, `constructor`, ...) are not mistaken for existing definitions
  // when `env.abbreviations` is a plain object supplied by the caller.
  if (!Object.prototype.hasOwnProperty.call(env.abbreviations, label)) {
    env.abbreviations[label] = title;
  }
  return pos;
 }
 module.exports = function abbr(state) {
  var tokens = state.tokens, i, l, content, pos;
  // Parse inlines
  for (i = 1, l = tokens.length - 1; i < l; i++) {
    if (tokens[i - 1].type === 'paragraph_open' &&
        tokens[i].type === 'inline' &&
        tokens[i + 1].type === 'paragraph_close') {
      content = tokens[i].content;
      while (content.length) {
        pos = parseAbbr(content, state.inline, state.options, state.env);
        if (pos < 0) { break; }
        content = content.slice(pos).trim();
      }
      tokens[i].content = content;
      if (!content.length) {
        tokens[i - 1].tight = true;
        tokens[i + 1].tight = true;
      }
    }
  }
 };
--- a/lib/rules_core/abbr2.js
+++ b/lib/rules_core/abbr2.js
@ -0,0 +1,86 @@
 // Enclose abbreviations in <abbr> tags
 //
 'use strict';
 // Boundary characters: an abbreviation is only recognised when it is both
 // preceded and followed by one of these characters (or by the start/end of
 // the text). Each character is regex-escaped and placed in a character class.
 var PUNCT_CHARS = ' \n()[]\'".,!?-';
 // Prefix every character of `s` that the pattern below lists with a backslash,
 // so the result can be embedded in a RegExp source as literal text.
 // Pattern taken from the Google Closure library:
 // http://closure-library.googlecode.com/git-history/docs/local_closure_goog_string_string.js.source.html#line1021
 function regEscape(s) {
  var specialChars = /([-()\[\]{}+?*.$\^|,:#<!\\])/g;
  var escaped = s.replace(specialChars, '\\$1');
  return escaped;
 }
 module.exports = function abbr2(state) {
  var i, j, l, tokens, token, text, nodes, pos, level, reg, m, regText,
      blockTokens = state.tokens;
  if (!state.env.abbreviations) { return; }
  if (!state.env.abbrRegExp) {
    regText = '(^|[' + PUNCT_CHARS.split('').map(regEscape).join('') + '])'
            + '(' + Object.keys(state.env.abbreviations).sort(function (a, b) {
                      return b.length - a.length;
                    }).map(regEscape).join('|') + ')'
            + '($|[' + PUNCT_CHARS.split('').map(regEscape).join('') + '])';
    state.env.abbrRegExp = new RegExp(regText, 'g');
  }
  reg = state.env.abbrRegExp;
  for (j = 0, l = blockTokens.length; j < l; j++) {
    if (blockTokens[j].type !== 'inline') { continue; }
    tokens = blockTokens[j].children;
    // We scan from the end, to keep position when new tags added.
    for (i = tokens.length - 1; i >= 0; i--) {
      token = tokens[i];
      if (token.type !== 'text') { continue; }
      pos = 0;
      text = token.content;
      reg.lastIndex = 0;
      level = token.level;
      nodes = [];
      while ((m = reg.exec(text))) {
        if (reg.lastIndex > pos) {
          nodes.push({
            type: 'text',
            content: text.slice(pos, m.index + m[1].length),
            level: level
          });
        }
        nodes.push({
          type: 'abbr_open',
          title: state.env.abbreviations[m[2]],
          level: level++
        });
        nodes.push({
          type: 'text',
          content: m[2],
          level: level
        });
        nodes.push({
          type: 'abbr_close',
          level: --level
        });
        pos = reg.lastIndex - m[3].length;
      }
      if (!nodes.length) { continue; }
      if (pos < text.length) {
        nodes.push({
          type: 'text',
          content: text.slice(pos),
          level: level
        });
      }
      // replace current node
      blockTokens[j].children = tokens = [].concat(tokens.slice(0, i), nodes, tokens.slice(i + 1));
    }
  }
 };
--- a/test/fixtures/remarkable/abbr.txt
+++ b/test/fixtures/remarkable/abbr.txt
@ -0,0 +1,87 @@
 An example from php markdown readme:
 .
 *[HTML]: Hyper Text Markup Language
 *[W3C]:  World Wide Web Consortium
 The HTML specification
 is maintained by the W3C.
 .
 <p>The <abbr title="Hyper Text Markup Language">HTML</abbr> specification
 is maintained by the <abbr title="World Wide Web Consortium">W3C</abbr>.</p>
 .
 They can be multiline (see pandoc implementation). Not sure about newlines, but we should at least skip those definitions:
 .
 *[
 foo
 bar
 ]: desc
 foo
 .
 <p>foo</p>
 .
 They can contain arbitrary markup (see pandoc implementation):
 .
 *[`]:`]: foo
 \`]:\`
 .
 <p><abbr title="foo">`]:`</abbr></p>
 .
 Can contain matched brackets:
 .
 *[[abbr]]: foo
 [abbr]
 .
 <p><abbr title="foo">[abbr]</abbr></p>
 .
 No empty abbreviations:
 .
 *[foo]: 
 foo
 .
 <p>*[foo]:
 foo</p>
 .
 Intersecting abbreviations (first should match):
 .
 *[Bar Foo]: 123
 *[Foo Bar]: 456
 Foo Bar Foo
 Bar Foo Bar
 .
 <p><abbr title="456">Foo Bar</abbr> Foo</p>
 <p><abbr title="123">Bar Foo</abbr> Bar</p>
 .
 Don't bother with nested abbreviations (yet?):
 .
 *[JS]: javascript
 *[HTTP]: hyper text blah blah
 *[JS HTTP]: is awesome
 JS HTTP is a collection of low-level javascript HTTP-related modules
 .
 <p><abbr title="is awesome">JS HTTP</abbr> is a collection of low-level javascript <abbr title="hyper text blah blah">HTTP</abbr>-related modules</p>
 .
 Don't match the middle of the string:
 .
 *[foo]: blah
 *[bar]: blah
 foobar
 .
 <p>foobar</p>
 .