diff --git a/lib/common/utils.js b/lib/common/utils.js index 33bfcc4..712cd29 100644 --- a/lib/common/utils.js +++ b/lib/common/utils.js @@ -257,6 +257,31 @@ function normalizeReference(str) { // .toLowerCase().toUpperCase() should get rid of all differences // between letter variants. // + // Simple .toLowerCase() doesn't normalize 125 code points correctly, + // and .toUpperCase doesn't normalize 6 of them (list of exceptions: + // İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently + // uppercased versions). + // + // Here's an example showing how it happens. Lets take greek letter omega: + // uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ) + // + // Unicode entries: + // 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8; + // 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398 + // 03D1;GREEK THETA SYMBOL;Ll;0;L; 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 + // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L; 0398;;;;N;;;;03B8; + // + // Case-insensitive comparison should treat all of them as equivalent. + // + // But .toLowerCase() doesn't change ϑ (it's already lowercase), + // and .toUpperCase() doesn't change ϴ (already uppercase). + // + // Applying first lower then upper case normalizes any character: + // '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398' + // + // Note: this is equivalent to unicode case folding; unicode normalization + // is a different step that is not required here. + // // Final result should be uppercased, because it's later stored in an object // (this avoid a conflict with Object.prototype members, // most notably, `__proto__`)