Merge pull request #2932 from plotly/text-entities

alexcjohnson · web-flow · commit b3dd749bacb8 · 2018-09-04T11:55:45.000-04:00
improve pseudo-html entity conversion
diff --git a/src/constants/string_mappings.js b/src/constants/string_mappings.js
diff --git a/src/lib/html2unicode.js b/src/lib/html2unicode.js
@@ -10,7 +10,7 @@
 'use strict';
 
 var toSuperScript = require('superscript-text');
-var stringMappings = require('../constants/string_mappings');
+var fixEntities = require('./svg_text_utils').convertEntities;
 
 function fixSuperScript(x) {
     var idx = 0;
@@ -33,28 +33,6 @@ function stripTags(x) {
     return x.replace(/\<.*\>/g, '');
 }
 
-function fixEntities(x) {
-    var entityToUnicode = stringMappings.entityToUnicode;
-    var idx = 0;
-
-    while((idx = x.indexOf('&', idx)) >= 0) {
-        var nidx = x.indexOf(';', idx);
-        if(nidx < idx) {
-            idx += 1;
-            continue;
-        }
-
-        var entity = entityToUnicode[x.slice(idx + 1, nidx)];
-        if(entity) {
-            x = x.slice(0, idx) + entity + x.slice(nidx + 1);
-        } else {
-            x = x.slice(0, idx) + x.slice(nidx + 1);
-        }
-    }
-
-    return x;
-}
-
 function convertHTMLToUnicode(html) {
     return '' +
         fixEntities(
diff --git a/src/lib/svg_text_utils.js b/src/lib/svg_text_utils.js
@@ -15,7 +15,6 @@ var d3 = require('d3');
 
 var Lib = require('../lib');
 var xmlnsNamespaces = require('../constants/xmlns_namespaces');
-var stringMappings = require('../constants/string_mappings');
 var LINE_SPACING = require('../constants/alignment').LINE_SPACING;
 
 // text converter
@@ -223,13 +222,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':'];
 
 var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g');
 
-var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) {
-    return {
-        regExp: new RegExp('&' + k + ';', 'g'),
-        sub: stringMappings.entityToUnicode[k]
-    };
-});
-
 var NEWLINES = /(\r\n?|\n)/g;
 
 var SPLIT_TAGS = /(<[^<>]*>)/;
@@ -254,6 +246,14 @@ var BR_TAG = /<br(\s+.*)?>/i;
  *
  * Because we hack in other attributes with style (sub & sup), drop any trailing
  * semicolon in user-supplied styles so we can consistently append the tag-dependent style
+ *
+ * These are for tag attributes; Chrome anyway will convert entities in
+ * attribute values, but not in attribute names
+ * you can test this by for example:
+ * > p = document.createElement('p')
+ * > p.innerHTML = '<span styl&#x65;="font-color:r&#x65;d;">Hi</span>'
+ * > p.innerHTML
+ * <- '<span styl&#x65;="font-color:red;">Hi</span>'
  */
 var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i;
 var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i;
@@ -265,7 +265,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i;
 function getQuotedMatch(_str, re) {
     if(!_str) return null;
     var match = _str.match(re);
-    return match && (match[3] || match[4]);
+    var result = match && (match[3] || match[4]);
+    return result && convertEntities(result);
 }
 
 var COLORMATCH = /(^|;)\s*color:/;
@@ -276,19 +277,70 @@ exports.plainText = function(_str) {
     return (_str || '').replace(STRIP_TAGS, ' ');
 };
 
-function replaceFromMapObject(_str, list) {
-    if(!_str) return '';
+/*
+ * N.B. HTML entities are listed without the leading '&' and trailing ';'
+ * https://www.freeformatter.com/html-entities.html
+ *
+ * FWIW if we wanted to support the full set, it has 2261 entries:
+ * https://www.w3.org/TR/html5/entities.json
+ * though I notice that some of these are duplicates and/or are missing ";"
+ * eg: "&amp;", "&amp", "&AMP;", and "&AMP" all map to "&"
+ * We no longer need to include numeric entities here, these are now handled
+ * by String.fromCodePoint/fromCharCode
+ *
+ * Anyway the only ones that are really important to allow are the HTML special
+ * chars <, >, and &, because these ones can trigger special processing if not
+ * replaced by the corresponding entity.
+ */
+var entityToUnicode = {
+    mu: 'μ',
+    amp: '&',
+    lt: '<',
+    gt: '>',
+    nbsp: ' ',
+    times: '×',
+    plusmn: '±',
+    deg: '°'
+};
 
-    for(var i = 0; i < list.length; i++) {
-        var item = list[i];
-        _str = _str.replace(item.regExp, item.sub);
-    }
+// NOTE: in general entities can contain uppercase too (so [a-zA-Z]) but all the
+// ones we support use only lowercase. If we ever change that, update the regex.
+var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g;
+function convertEntities(_str) {
+    return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) {
+        var outChar;
+        if(innerMatch.charAt(0) === '#') {
+            // cannot use String.fromCodePoint in IE
+            outChar = fromCodePoint(
+                innerMatch.charAt(1) === 'x' ?
+                    parseInt(innerMatch.substr(2), 16) :
+                    parseInt(innerMatch.substr(1), 10)
+            );
+        }
+        else outChar = entityToUnicode[innerMatch];
 
-    return _str;
+        // as in regular HTML, if we didn't decode the entity just
+        // leave the raw text in place.
+        return outChar || fullMatch;
+    });
 }
-
-function convertEntities(_str) {
-    return replaceFromMapObject(_str, ENTITY_TO_UNICODE);
+exports.convertEntities = convertEntities;
+
+function fromCodePoint(code) {
+    // Don't allow overflow. In Chrome this turns into � but I feel like it's
+    // more useful to just not convert it at all.
+    if(code > 0x10FFFF) return;
+    var stringFromCodePoint = String.fromCodePoint;
+    if(stringFromCodePoint) return stringFromCodePoint(code);
+
+    // IE doesn't have String.fromCodePoint
+    // see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint
+    var stringFromCharCode = String.fromCharCode;
+    if(code <= 0xFFFF) return stringFromCharCode(code);
+    return stringFromCharCode(
+        (code >> 10) + 0xD7C0,
+        (code % 0x400) + 0xDC00
+    );
 }
 
 /*
@@ -302,15 +354,14 @@ function convertEntities(_str) {
  *   somewhat differently if it does, so just keep track of this when it happens.
  */
 function buildSVGText(containerNode, str) {
-    str = convertEntities(str)
-        /*
-         * Normalize behavior between IE and others wrt newlines and whitespace:pre
-         * this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
-         * Chrome and FF display \n, \r, or \r\n as a space in this mode.
-         * I feel like at some point we turned these into <br> but currently we don't so
-         * I'm just going to cement what we do now in Chrome and FF
-         */
-        .replace(NEWLINES, ' ');
+    /*
+     * Normalize behavior between IE and others wrt newlines and whitespace:pre
+     * this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
+     * Chrome and FF display \n, \r, or \r\n as a space in this mode.
+     * I feel like at some point we turned these into <br> but currently we don't so
+     * I'm just going to cement what we do now in Chrome and FF
+     */
+    str = str.replace(NEWLINES, ' ');
 
     var hasLink = false;
 
@@ -435,7 +486,7 @@ function buildSVGText(containerNode, str) {
             newLine();
         }
         else if(tagStyle === undefined) {
-            addTextNode(currentNode, parti);
+            addTextNode(currentNode, convertEntities(parti));
         }
         else {
             // tag - open or close
diff --git a/test/jasmine/tests/svg_text_utils_test.js b/test/jasmine/tests/svg_text_utils_test.js
@@ -8,6 +8,16 @@ describe('svg+text utils', function() {
 
     describe('convertToTspans', function() {
 
+        var stringFromCodePoint;
+
+        beforeAll(function() {
+            stringFromCodePoint = String.fromCodePoint;
+        });
+
+        afterEach(function() {
+            String.fromCodePoint = stringFromCodePoint;
+        });
+
         function mockTextSVGElement(txt) {
             return d3.select('body')
                 .append('svg')
@@ -300,16 +310,78 @@ describe('svg+text utils', function() {
                 '100 &times; 20 &plusmn; 0.5 &deg;'
             );
 
-            expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
+            expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
         });
 
         it('decodes some HTML entities in text (number case)', function() {
             var node = mockTextSVGElement(
-                '100&#956; &#28; &#60; 10 &#62; 0 &#160;' +
+                '100&#956; &#38; &#60; 10 &#62; 0 &#160;' +
                 '100 &#215; 20 &#177; 0.5 &#176;'
             );
 
-            expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
+            expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
+        });
+
+        it('decodes arbitrary decimal and hex number entities', function() {
+            var i = 0;
+            for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
+                var node = mockTextSVGElement(
+                    '&#x' + n.toString(16) +
+                    '; = &#' + n.toString() +
+                    '; = &#x' + n.toString(16).toUpperCase() + ';'
+                );
+                var char = String.fromCodePoint(n);
+                expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
+                i++;
+            }
+            // not really necessary to assert this, but we tested  355 characters,
+            // weighted toward the low end but continuing all the way to the
+            // end of the unicode definition
+            expect(i).toBe(355);
+        });
+
+        it('decodes arbitrary decimal and hex number entities (IE case)', function() {
+            // IE does not have String.fromCodePoint
+            String.fromCodePoint = undefined;
+            expect(String.fromCodePoint).toBeUndefined();
+
+            var i = 0;
+            for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
+                var node = mockTextSVGElement(
+                    '&#x' + n.toString(16) +
+                    '; = &#' + n.toString() +
+                    '; = &#x' + n.toString(16).toUpperCase() + ';'
+                );
+                var char = stringFromCodePoint(n);
+                expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
+                i++;
+            }
+            // not really necessary to assert this, but we tested  355 characters,
+            // weighted toward the low end but continuing all the way to the
+            // end of the unicode definition
+            expect(i).toBe(355);
+        });
+
+        it('does not decode entities prematurely', function() {
+            var testCases = [
+                '&lt;b>not bold</b&gt;',
+                '<b&gt;not bold</b&gt;',
+                '&lt;b>not bold&lt;/b>',
+                '<b&gt;not bold&lt;/b>',
+                '&lt;b&gt;not bold&lt;/b&gt;'
+            ];
+            testCases.forEach(function(testCase) {
+                var node = mockTextSVGElement(testCase);
+
+                expect(node.html()).toBe(
+                    '&lt;b&gt;not bold&lt;/b&gt;', testCase
+                );
+            });
+
+            var controlNode = mockTextSVGElement('<b>bold</b>');
+            expect(controlNode.html()).toBe(
+                '<tspan style="font-weight:bold">bold</tspan>'
+            );
         });
 
         it('supports superscript by itself', function() {