Fix surrogate pairs instead of processing encoder exceptions

10 years ago · 0fa09e9cd2
1 changed files with 34 additions and 8 deletions
--- a/lib/common/utils.js
+++ b/lib/common/utils.js
@@ -130,6 +130,37 @@ function escapeHtml(str) {

 ////////////////////////////////////////////////////////////////////////////////

+var SURRORATE_TEST_RE   = /[\uD800-\uDFFF]/;  // any surrogate code unit; non-global, so .test() keeps no lastIndex state
+var SURRORATE_SEARCH_RE = /[\uD800-\uDFFF]/g; // same character class with /g, for replacing every occurrence
+
+// String.replace callback (match, offset, whole string): keeps a surrogate that is half of a valid pair, else returns U+FFFD.
+function replaceBadSurrogate(ch, pos, orig) {
+  var code = ch.charCodeAt(0);
+  if (code >= 0xD800 && code <= 0xDBFF) {
+    // high surrogate: valid only when directly followed by a low surrogate (0xDC00-0xDFFF)
+    if (pos >= orig.length - 1) { return '\uFFFD'; }
+    code = orig.charCodeAt(pos + 1);
+    if (code < 0xDC00 || code > 0xDFFF) { return '\uFFFD'; }
+
+    return ch;
+  }
+
+  // low surrogate: valid only when directly preceded by a high surrogate (0xD800-0xDBFF)
+  if (pos === 0) { return '\uFFFD'; }
+  code = orig.charCodeAt(pos - 1);
+  if (code < 0xD800 || code > 0xDBFF) { return '\uFFFD'; }
+  return ch;
+}
+
+// Runs every surrogate code unit of `str` through replaceBadSurrogate; strings containing none are returned as-is.
+function fixBrokenSurrogates(str) {
+  var hasSurrogate = SURRORATE_TEST_RE.test(str);
+  return hasSurrogate ? str.replace(SURRORATE_SEARCH_RE, replaceBadSurrogate) : str;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+
 // Incoming link can be partially encoded. Convert possible combinations to
 // unified form.
 //
@@ -148,14 +179,9 @@ function normalizeLink(url) {
    normalized = decodeURI(normalized);
  } catch (__) {}

-  // Encoder throws exception on broken unicode sequence.
-  // Kill suspicious data for the safety.
-  //
-  try {
-    return encodeURI(normalized);
-  } catch (__) {
-    return '';
-  }
+  // Encoder throws exception on broken surrogate pairs.
+  // Fix those first.
+  return encodeURI(fixBrokenSurrogates(normalized));
 }

 ////////////////////////////////////////////////////////////////////////////////